mirror of
https://github.com/n8n-io/n8n.git
synced 2026-05-12 16:10:30 +02:00
feat(ai-builder): Add per-PR eval regression detection vs LangSmith baseline (#29456)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
5b01cba8b2
commit
bbe3e2d148
24
.github/workflows/test-evals-instance-ai.yml
vendored
24
.github/workflows/test-evals-instance-ai.yml
vendored
|
|
@ -143,7 +143,7 @@ jobs:
|
|||
--base-url "$BASE_URLS" \
|
||||
--concurrency 32 \
|
||||
--verbose \
|
||||
--iterations 3 \
|
||||
--iterations 5 \
|
||||
${{ inputs.filter && format('--filter "{0}"', inputs.filter) || '' }}
|
||||
|
||||
- name: Stop n8n containers
|
||||
|
|
@ -160,22 +160,16 @@ jobs:
|
|||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
RESULTS_FILE="packages/@n8n/instance-ai/eval-results.json"
|
||||
if [ ! -f "$RESULTS_FILE" ]; then
|
||||
echo "No eval results file found"
|
||||
# The eval CLI writes the full PR comment as eval-pr-comment.md
|
||||
# (see comparison/format.ts:formatComparisonMarkdown). It includes
|
||||
# the alert, aggregate, comparison sections, per-test-case results
|
||||
# collapsed, and failure details collapsed. CI just relays it.
|
||||
COMMENT_FILE="packages/@n8n/instance-ai/eval-pr-comment.md"
|
||||
if [ ! -f "$COMMENT_FILE" ]; then
|
||||
echo "No PR comment file found (eval likely cancelled before writing results)"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Build the full comment body with jq
|
||||
jq -r '
|
||||
"### Instance AI Workflow Eval Results\n\n" +
|
||||
"**\(.summary.built)/\(.summary.testCases) built | \(.totalRuns) run(s) | pass@\(.totalRuns): \(.summary.passAtK * 100 | floor)% | pass^\(.totalRuns): \(.summary.passHatK * 100 | floor)% | iterations: \(.summary.passRatePerIter)**\n\n" +
|
||||
"| Workflow | Build | pass@\(.totalRuns) | pass^\(.totalRuns) |\n|---|---|---|---|\n" +
|
||||
([.testCases[] as $tc | "| \($tc.name) | \($tc.buildSuccessCount)/\($tc.totalRuns) | \(([$tc.scenarios[] | .passAtK] | add) / ($tc.scenarios | length) * 100 | floor)% | \(([$tc.scenarios[] | .passHatK] | add) / ($tc.scenarios | length) * 100 | floor)% |"] | join("\n")) +
|
||||
"\n\n<details><summary>Failure details</summary>\n\n" +
|
||||
([.testCases[] as $tc | $tc.scenarios[] | select(.passHatK < 1) | "**\($tc.name) / \(.name)** — \(.passCount)/\(.totalRuns) passed" + "\n" + ([.runs[] | select(.passed == false) | "> Run\(if .failureCategory then " [\(.failureCategory)]" else "" end): \(.reasoning | .[0:200])"] | join("\n"))] | join("\n\n")) +
|
||||
"\n</details>"
|
||||
' "$RESULTS_FILE" > /tmp/eval-comment.md
|
||||
cp "$COMMENT_FILE" /tmp/eval-comment.md
|
||||
|
||||
# Find and update existing eval comment, or create new one
|
||||
COMMENT_ID=$(gh api "repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \
|
||||
|
|
|
|||
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -36,6 +36,7 @@ packages/testing/playwright/playwright-report
|
|||
packages/testing/playwright/test-results
|
||||
packages/testing/playwright/eval-results.json
|
||||
packages/@n8n/instance-ai/eval-results.json
|
||||
packages/@n8n/instance-ai/eval-pr-comment.md
|
||||
packages/testing/playwright/.playwright-browsers
|
||||
packages/testing/playwright/.playwright-cli
|
||||
test-results/
|
||||
|
|
|
|||
|
|
@ -121,7 +121,7 @@ dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai --iterations 3
|
|||
| `--base-url` | `http://localhost:5678` | n8n instance URL |
|
||||
| `--email` | E2E test owner | Override login email (or `N8N_EVAL_EMAIL`) |
|
||||
| `--password` | E2E test owner | Override login password (or `N8N_EVAL_PASSWORD`) |
|
||||
| `--timeout-ms` | `600000` | Per-test-case timeout |
|
||||
| `--timeout-ms` | `900000` | Per-test-case timeout |
|
||||
| `--output-dir` | cwd | Where to write `eval-results.json` |
|
||||
| `--dataset` | `instance-ai-workflow-evals` | LangSmith dataset name |
|
||||
| `--concurrency` | `16` | Max concurrent scenarios (builds are separately capped at 4) |
|
||||
|
|
@ -155,6 +155,47 @@ Every run produces:
|
|||
|
||||
**LangSmith caveat:** if `LANGSMITH_API_KEY` is set in `.env.local`, local runs also land in the shared `instance-ai-workflow-evals` dataset. Unset it (or run without `dotenvx`) to keep exploratory runs out of team results.
|
||||
|
||||
## Regression detection
|
||||
|
||||
When `LANGSMITH_API_KEY` is set, every eval run automatically compares its results against the most recent pinned baseline (any experiment whose name starts with `instance-ai-baseline-`). Two output files are written:
|
||||
|
||||
- `eval-results.json` — structured data only, including `comparison.result` when a baseline was found.
|
||||
- `eval-pr-comment.md` — the full PR comment rendered as markdown, including the alert, aggregate, comparison sections, per-test-case results, and failure details. Always written; falls back to a no-baseline summary when no comparison ran.
|
||||
|
||||
The CI PR-comment step uses `eval-pr-comment.md` as the entire comment body (no jq assembly in the workflow). The console output uses a separate aligned-text formatter — same data, no markdown noise in the terminal.
|
||||
|
||||
### Refreshing the baseline
|
||||
|
||||
There is no auto-refresh — refresh explicitly when you want a new reference point, ideally with high N for low noise:
|
||||
|
||||
```bash
|
||||
# From packages/@n8n/instance-ai/, on master at the version you want to pin
|
||||
LANGSMITH_API_KEY=... dotenvx run -f ../../../.env.local -- \
|
||||
pnpm eval:instance-ai --experiment-name instance-ai-baseline --iterations 10
|
||||
```
|
||||
|
||||
LangSmith appends a random suffix (e.g. `instance-ai-baseline-7abc1234`); the most recently started one becomes the comparison target on the next eval run. The comparison is silently skipped on the baseline-creation run itself.
|
||||
|
||||
### How scenarios are tiered
|
||||
|
||||
Each scenario lands in one of three regression tiers, evaluated in order of strictness:
|
||||
|
||||
- **Regression** — high-confidence flag, gating-grade. The drop must be statistically significant (chance of seeing it by noise < 5%), at least 30 percentage points in size, and the baseline must have been reliable (≥ 70% pass rate).
|
||||
- **Soft regression** — looser bar for visibility on borderline cases. Looser confidence threshold (chance by noise < 20%), drop ≥ 15 percentage points, baseline ≥ 50%. Frequently natural variance — worth a glance only if your changes touch related code paths.
|
||||
- **Notable movement** — any scenario whose pass rate moved by ≥ 35 percentage points without reaching either flag tier. Pure visibility, no implication of cause.
|
||||
|
||||
Other verdicts: `improvement` (PR significantly better, skips the reliability gate), `unreliable_baseline` (confident drop but baseline was too flaky to call a regression — surfaced but not flagged), `stable`, `insufficient_data`.
|
||||
|
||||
Why these tiers and not a flat percentage threshold? At the small N PR runs use (typically 3 iterations), a flat threshold can't tell a real regression from coin-flip noise. The confidence cutoff filters out gaps that could plausibly happen by chance, and the reliability gate avoids chasing noise on already-flaky scenarios. Implementation lives in `comparison/statistics.ts` (Fisher's exact test for the confidence check, Wilson interval for the headline aggregate band). Tune the soft tier first if the false-positive rate looks off — keep the hard tier strict.
|
||||
|
||||
### Failure-category drift
|
||||
|
||||
When both sides captured per-trial `failureCategory` values, the comparison also surfaces a run-level table of category rates (PR vs baseline). A category is marked **notable** when its absolute rate delta is ≥ 5 percentage points _and_ the count change beyond what scenario-count scaling would predict is ≥ 3 trials. This catches cross-scenario shifts (e.g. mock-generation breaking, or a model getting weaker overall) that per-scenario flags can miss.
|
||||
|
||||
### Best-effort
|
||||
|
||||
Comparison is logged and skipped on any LangSmith failure — it never fails the eval. It is also skipped when no baseline experiment exists yet.
|
||||
|
||||
## Pairwise evals
|
||||
|
||||
Pairwise evals score a built workflow against the dataset's `dos` / `donts`
|
||||
|
|
|
|||
|
|
@ -0,0 +1,190 @@
|
|||
import { compareBuckets, type ExperimentBucket, type ScenarioCounts } from '../comparison/compare';
|
||||
|
||||
function bucket(
|
||||
name: string,
|
||||
scenarios: ScenarioCounts[],
|
||||
categories?: { totals: Record<string, number>; trialTotal: number },
|
||||
): ExperimentBucket {
|
||||
return {
|
||||
experimentName: name,
|
||||
scenarios: new Map(scenarios.map((s) => [`${s.testCaseFile}/${s.scenarioName}`, s])),
|
||||
failureCategoryTotals: categories?.totals,
|
||||
trialTotal: categories?.trialTotal,
|
||||
};
|
||||
}
|
||||
|
||||
function s(file: string, scenario: string, passed: number, total: number): ScenarioCounts {
|
||||
return { testCaseFile: file, scenarioName: scenario, passed, total };
|
||||
}
|
||||
|
||||
describe('compareBuckets', () => {
|
||||
it('produces a clean intersection when both sides have the same scenarios', () => {
|
||||
const pr = bucket('pr', [s('contact', 'happy', 8, 10), s('weather', 'happy', 1, 10)]);
|
||||
const base = bucket('master', [s('contact', 'happy', 9, 10), s('weather', 'happy', 0, 10)]);
|
||||
|
||||
const result = compareBuckets(pr, base);
|
||||
|
||||
expect(result.scenarios).toHaveLength(2);
|
||||
expect(result.prOnly).toEqual([]);
|
||||
expect(result.baselineOnly).toEqual([]);
|
||||
expect(result.aggregate.intersectionSize).toBe(2);
|
||||
});
|
||||
|
||||
it('flags scenarios only present on one side', () => {
|
||||
const pr = bucket('pr', [s('contact', 'happy', 5, 10)]);
|
||||
const base = bucket('master', [s('contact', 'happy', 8, 10), s('weather', 'happy', 5, 10)]);
|
||||
|
||||
const result = compareBuckets(pr, base);
|
||||
|
||||
expect(result.scenarios).toHaveLength(1);
|
||||
expect(result.scenarios[0].testCaseFile).toBe('contact');
|
||||
expect(result.baselineOnly).toEqual([{ testCaseFile: 'weather', scenarioName: 'happy' }]);
|
||||
expect(result.prOnly).toEqual([]);
|
||||
});
|
||||
|
||||
it('aggregates only over the intersection, not over baseline-only or pr-only', () => {
|
||||
const pr = bucket('pr', [s('contact', 'happy', 10, 10)]);
|
||||
const base = bucket('master', [s('contact', 'happy', 5, 10), s('other', 'happy', 0, 10)]);
|
||||
|
||||
const result = compareBuckets(pr, base);
|
||||
|
||||
expect(result.aggregate.prAggregatePassRate).toBe(1);
|
||||
expect(result.aggregate.baselineAggregatePassRate).toBe(0.5);
|
||||
expect(result.aggregate.intersectionSize).toBe(1);
|
||||
});
|
||||
|
||||
it('sorts scenarios with regressions first, then improvements, then stable', () => {
|
||||
const pr = bucket('pr', [
|
||||
s('a', 'stable', 10, 10),
|
||||
s('b', 'regression', 0, 10),
|
||||
s('c', 'improvement', 10, 10),
|
||||
]);
|
||||
const base = bucket('master', [
|
||||
s('a', 'stable', 10, 10),
|
||||
s('b', 'regression', 10, 10),
|
||||
s('c', 'improvement', 0, 10),
|
||||
]);
|
||||
|
||||
const result = compareBuckets(pr, base);
|
||||
expect(result.scenarios.map((sc) => sc.scenarioName)).toEqual([
|
||||
'regression',
|
||||
'improvement',
|
||||
'stable',
|
||||
]);
|
||||
});
|
||||
|
||||
it('returns insufficient_data when one side has zero trials for a scenario', () => {
|
||||
const pr = bucket('pr', [s('contact', 'happy', 0, 0)]);
|
||||
const base = bucket('master', [s('contact', 'happy', 10, 10)]);
|
||||
|
||||
const result = compareBuckets(pr, base);
|
||||
expect(result.scenarios[0].verdict).toBe('insufficient_data');
|
||||
});
|
||||
|
||||
it('returns no failure-category drift when either side lacks category totals', () => {
|
||||
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
|
||||
const base = bucket('master', [s('a', 'happy', 8, 10)]);
|
||||
expect(compareBuckets(pr, base).failureCategories).toEqual([]);
|
||||
});
|
||||
|
||||
it('flags a category as notable when both rate and trial-count gaps clear the bars', () => {
|
||||
// Haiku-style shift: framework_issue 0/290 → 9/145.
|
||||
// Rate gap: 6.2pp ≥ 5pp ✓. Expected PR count given baseline = 0 × (145/290) = 0; |9 − 0| = 9 ≥ 3 ✓.
|
||||
const pr = bucket('pr', [s('a', 'happy', 50, 145)], {
|
||||
totals: { framework_issue: 9 },
|
||||
trialTotal: 145,
|
||||
});
|
||||
const base = bucket('master', [s('a', 'happy', 200, 290)], {
|
||||
totals: { framework_issue: 0 },
|
||||
trialTotal: 290,
|
||||
});
|
||||
const cats = compareBuckets(pr, base).failureCategories;
|
||||
const fw = cats.find((c) => c.category === 'framework_issue');
|
||||
expect(fw?.notable).toBe(true);
|
||||
});
|
||||
|
||||
it('does not flag when the rate gap is below the 5pp bar', () => {
|
||||
// 3/100 vs 2/100 = 1pp gap, count gap = 1 — neither bar cleared.
|
||||
const pr = bucket('pr', [s('a', 'happy', 50, 100)], {
|
||||
totals: { mock_issue: 3 },
|
||||
trialTotal: 100,
|
||||
});
|
||||
const base = bucket('master', [s('a', 'happy', 50, 100)], {
|
||||
totals: { mock_issue: 2 },
|
||||
trialTotal: 100,
|
||||
});
|
||||
const cats = compareBuckets(pr, base).failureCategories;
|
||||
expect(cats.find((c) => c.category === 'mock_issue')?.notable).toBe(false);
|
||||
});
|
||||
|
||||
it('does not flag when the rate gap is large but the count gap is tiny (small N guard)', () => {
|
||||
// PR 1/3 vs baseline 0/270 — rate gap = 33pp ≥ 5pp, but expected PR count = 0
|
||||
// and observed = 1, count gap = 1 < 3. Should NOT flag — single trial on small N.
|
||||
const pr = bucket('pr', [s('a', 'happy', 0, 3)], {
|
||||
totals: { builder_issue: 1 },
|
||||
trialTotal: 3,
|
||||
});
|
||||
const base = bucket('master', [s('a', 'happy', 270, 270)], {
|
||||
totals: { builder_issue: 0 },
|
||||
trialTotal: 270,
|
||||
});
|
||||
const cats = compareBuckets(pr, base).failureCategories;
|
||||
expect(cats.find((c) => c.category === 'builder_issue')?.notable).toBe(false);
|
||||
});
|
||||
|
||||
it('drops unknown categories with a console warning, keeps all known categories', () => {
|
||||
const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
|
||||
const pr = bucket('pr', [s('a', 'happy', 8, 10)], {
|
||||
totals: { '-': 5, builder_issue: 2 },
|
||||
trialTotal: 10,
|
||||
});
|
||||
const base = bucket('master', [s('a', 'happy', 8, 10)], {
|
||||
totals: { builder_issue: 1 },
|
||||
trialTotal: 10,
|
||||
});
|
||||
const cats = compareBuckets(pr, base).failureCategories;
|
||||
// All five known categories are always present (some at 0/0 — renderer
|
||||
// drops those). The unknown `-` category is dropped here with a warning.
|
||||
expect(cats.map((c) => c.category).sort()).toEqual([
|
||||
'build_failure',
|
||||
'builder_issue',
|
||||
'framework_issue',
|
||||
'mock_issue',
|
||||
'verification_failure',
|
||||
]);
|
||||
expect(warn).toHaveBeenCalledWith(expect.stringContaining('"-"'));
|
||||
warn.mockRestore();
|
||||
});
|
||||
|
||||
it('sorts notable categories before non-notable, then by absolute delta', () => {
|
||||
const pr = bucket('pr', [s('a', 'happy', 50, 100)], {
|
||||
totals: { framework_issue: 10, mock_issue: 4, builder_issue: 25 },
|
||||
trialTotal: 100,
|
||||
});
|
||||
const base = bucket('master', [s('a', 'happy', 50, 100)], {
|
||||
totals: { framework_issue: 0, mock_issue: 3, builder_issue: 22 },
|
||||
trialTotal: 100,
|
||||
});
|
||||
const cats = compareBuckets(pr, base).failureCategories;
|
||||
// framework_issue is the only notable one (rate gap 10pp, count gap 10).
|
||||
expect(cats[0].category).toBe('framework_issue');
|
||||
expect(cats[0].notable).toBe(true);
|
||||
expect(cats.slice(1).every((c) => !c.notable)).toBe(true);
|
||||
});
|
||||
|
||||
it('accepts custom tiered thresholds for tests', () => {
|
||||
const pr = bucket('pr', [s('a', 'happy', 5, 10)]);
|
||||
const base = bucket('master', [s('a', 'happy', 8, 10)]);
|
||||
|
||||
// Defaults: 5/10 vs 8/10 = -30pp drop, p ≈ 0.18 → soft_regression
|
||||
// (passes soft maxPValue=0.20, soft minDelta=0.15, baseline 80% above soft 50%).
|
||||
const defaults = compareBuckets(pr, base);
|
||||
expect(defaults.scenarios[0].verdict).toBe('soft_regression');
|
||||
|
||||
// Stricter soft p-value cutoff excludes this case.
|
||||
const stricter = compareBuckets(pr, base, {
|
||||
soft: { maxPValue: 0.1, minDelta: 0.15, minBaselinePassRate: 0.5 },
|
||||
});
|
||||
expect(['stable', 'watch']).toContain(stricter.scenarios[0].verdict);
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,458 @@
|
|||
import {
|
||||
compareBuckets,
|
||||
type ComparisonOutcome,
|
||||
type ComparisonResult,
|
||||
type ExperimentBucket,
|
||||
type ScenarioCounts,
|
||||
} from '../comparison/compare';
|
||||
import { formatComparisonMarkdown, formatComparisonTerminal } from '../comparison/format';
|
||||
import type { MultiRunEvaluation, WorkflowTestCase, ScenarioResult } from '../types';
|
||||
|
||||
function ok(result: ComparisonResult): ComparisonOutcome {
|
||||
return { kind: 'ok', result };
|
||||
}
|
||||
|
||||
function slugMap(evaluation: MultiRunEvaluation, slugs: string[]): Map<WorkflowTestCase, string> {
|
||||
return new Map(evaluation.testCases.map((tc, i) => [tc.testCase, slugs[i] ?? 'unknown']));
|
||||
}
|
||||
|
||||
function bucket(name: string, scenarios: ScenarioCounts[]): ExperimentBucket {
|
||||
return {
|
||||
experimentName: name,
|
||||
scenarios: new Map(scenarios.map((s) => [`${s.testCaseFile}/${s.scenarioName}`, s])),
|
||||
};
|
||||
}
|
||||
|
||||
function s(file: string, scenario: string, passed: number, total: number): ScenarioCounts {
|
||||
return { testCaseFile: file, scenarioName: scenario, passed, total };
|
||||
}
|
||||
|
||||
/** Minimal evaluation fixture matching the shape format.ts reads. */
|
||||
function evaluation(
|
||||
opts: {
|
||||
totalRuns?: number;
|
||||
testCases?: Array<{
|
||||
prompt?: string;
|
||||
buildSuccessCount?: number;
|
||||
scenarios?: Array<{
|
||||
name: string;
|
||||
passCount: number;
|
||||
passes: boolean[]; // per-iteration pass/fail
|
||||
reasoning?: string;
|
||||
failureCategory?: string;
|
||||
}>;
|
||||
}>;
|
||||
} = {},
|
||||
): MultiRunEvaluation {
|
||||
const totalRuns = opts.totalRuns ?? 3;
|
||||
return {
|
||||
totalRuns,
|
||||
testCases: (opts.testCases ?? []).map((tc) => {
|
||||
const testCase = {
|
||||
prompt: tc.prompt ?? 'Test workflow prompt',
|
||||
complexity: 'medium' as const,
|
||||
tags: [],
|
||||
scenarios: (tc.scenarios ?? []).map((sa) => ({
|
||||
name: sa.name,
|
||||
description: '',
|
||||
dataSetup: '',
|
||||
successCriteria: '',
|
||||
})),
|
||||
} as WorkflowTestCase;
|
||||
const buildSuccessCount = tc.buildSuccessCount ?? totalRuns;
|
||||
const scenarios = (tc.scenarios ?? []).map((sa) => ({
|
||||
scenario: testCase.scenarios.find((sc) => sc.name === sa.name)!,
|
||||
passCount: sa.passCount,
|
||||
passRate: totalRuns > 0 ? sa.passCount / totalRuns : 0,
|
||||
passAtK: new Array(totalRuns).fill(sa.passCount > 0 ? 1 : 0) as number[],
|
||||
passHatK: new Array(totalRuns).fill(sa.passCount === totalRuns ? 1 : 0) as number[],
|
||||
runs: sa.passes.map(
|
||||
(passed): ScenarioResult => ({
|
||||
scenario: testCase.scenarios.find((sc) => sc.name === sa.name)!,
|
||||
success: passed,
|
||||
score: passed ? 1 : 0,
|
||||
reasoning: sa.reasoning ?? '',
|
||||
failureCategory: !passed ? sa.failureCategory : undefined,
|
||||
}),
|
||||
),
|
||||
}));
|
||||
return {
|
||||
testCase,
|
||||
workflowBuildSuccess: buildSuccessCount > 0,
|
||||
scenarioResults: [],
|
||||
scenarios,
|
||||
runs: new Array(totalRuns).fill(null).map(() => ({
|
||||
testCase,
|
||||
workflowBuildSuccess: buildSuccessCount > 0,
|
||||
scenarioResults: [],
|
||||
})),
|
||||
buildSuccessCount,
|
||||
};
|
||||
}),
|
||||
};
|
||||
}
|
||||
|
||||
describe('formatComparisonMarkdown', () => {
|
||||
const evalFixture = evaluation({
|
||||
totalRuns: 3,
|
||||
testCases: [
|
||||
{
|
||||
prompt: 'a',
|
||||
scenarios: [{ name: 'happy', passCount: 0, passes: [false, false, false] }],
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
it('renders heading, alert, aggregate, and a regression table', () => {
|
||||
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
|
||||
const base = bucket('master-abc', [s('a', 'happy', 10, 10)]);
|
||||
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
|
||||
|
||||
expect(md).toMatch(/### Instance AI Workflow Eval/);
|
||||
expect(md).toMatch(/> \[!CAUTION\]/);
|
||||
expect(md).toMatch(/1 regression/);
|
||||
expect(md).toMatch(/\*\*Aggregate\*\*: 0\.0% PR vs 100\.0% baseline/);
|
||||
expect(md).toMatch(/#### Regressions \(1\)/);
|
||||
expect(md).toMatch(/`a\/happy`/);
|
||||
expect(md).toMatch(/0\/3 \(0%\)/);
|
||||
expect(md).toMatch(/-100pp ↓/);
|
||||
});
|
||||
|
||||
it('uses TIP alert when there are only improvements', () => {
|
||||
const pr = bucket('pr', [s('a', 'happy', 3, 3)]);
|
||||
const base = bucket('master', [s('a', 'happy', 0, 10)]);
|
||||
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
|
||||
|
||||
expect(md).toMatch(/> \[!TIP\]/);
|
||||
expect(md).toMatch(/1 improvement/);
|
||||
expect(md).toMatch(/#### Improvements \(1\)/);
|
||||
expect(md).toMatch(/\+100pp ↑/);
|
||||
});
|
||||
|
||||
it('uses TIP alert with "0 regressions" when everything is stable', () => {
|
||||
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
|
||||
const base = bucket('master', [s('a', 'happy', 8, 10)]);
|
||||
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
|
||||
|
||||
expect(md).toMatch(/> \[!TIP\]/);
|
||||
expect(md).toMatch(/0 regressions/);
|
||||
expect(md).toMatch(/1 stable/);
|
||||
expect(md).not.toMatch(/#### Regressions/);
|
||||
});
|
||||
|
||||
it('renders LangSmith-disabled NOTE when outcome is undefined', () => {
|
||||
const md = formatComparisonMarkdown(evalFixture);
|
||||
expect(md).toMatch(/> \[!NOTE\]/);
|
||||
expect(md).toMatch(/LangSmith disabled/);
|
||||
expect(md).not.toMatch(/#### Regressions/);
|
||||
});
|
||||
|
||||
it('renders distinct alerts per skip reason', () => {
|
||||
const noBase = formatComparisonMarkdown(evalFixture, { kind: 'no_baseline' });
|
||||
expect(noBase).toMatch(/> \[!NOTE\]/);
|
||||
expect(noBase).toMatch(/No baseline configured/);
|
||||
|
||||
const selfBase = formatComparisonMarkdown(evalFixture, {
|
||||
kind: 'self_baseline',
|
||||
experimentName: 'instance-ai-baseline-abc',
|
||||
});
|
||||
expect(selfBase).toMatch(/> \[!NOTE\]/);
|
||||
expect(selfBase).toMatch(/This run is the baseline/);
|
||||
expect(selfBase).toMatch(/instance-ai-baseline-abc/);
|
||||
|
||||
const fetchFail = formatComparisonMarkdown(evalFixture, {
|
||||
kind: 'fetch_failed',
|
||||
error: 'LangSmith 503',
|
||||
});
|
||||
// fetch_failed is a real outage, not a benign skip — must be a WARNING.
|
||||
expect(fetchFail).toMatch(/> \[!WARNING\]/);
|
||||
expect(fetchFail).toMatch(/Regression detection did not run/);
|
||||
expect(fetchFail).toMatch(/LangSmith 503/);
|
||||
});
|
||||
|
||||
it('shows mixed-case alert when both regressions and improvements exist', () => {
|
||||
const pr = bucket('pr', [s('a', 'happy', 0, 3), s('b', 'happy', 3, 3)]);
|
||||
const base = bucket('master', [s('a', 'happy', 10, 10), s('b', 'happy', 0, 10)]);
|
||||
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
|
||||
expect(md).toMatch(/> \[!CAUTION\]/);
|
||||
expect(md).toMatch(/1 regression/);
|
||||
expect(md).toMatch(/1 improvement/);
|
||||
expect(md).toMatch(/#### Regressions/);
|
||||
expect(md).toMatch(/#### Improvements/);
|
||||
});
|
||||
|
||||
it('embeds commit SHA in heading when provided', () => {
|
||||
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
|
||||
const base = bucket('master', [s('a', 'happy', 8, 10)]);
|
||||
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)), {
|
||||
commitSha: 'abc1234567890def',
|
||||
});
|
||||
expect(md).toMatch(/### Instance AI Workflow Eval — `abc12345`/);
|
||||
});
|
||||
|
||||
it('marks new failure categories with 🆕', () => {
|
||||
const pr: ExperimentBucket = {
|
||||
experimentName: 'pr',
|
||||
scenarios: new Map([['a/happy', { ...s('a', 'happy', 0, 3) }]]),
|
||||
failureCategoryTotals: { framework_issue: 9 },
|
||||
trialTotal: 145,
|
||||
};
|
||||
const base: ExperimentBucket = {
|
||||
experimentName: 'master',
|
||||
scenarios: new Map([['a/happy', { ...s('a', 'happy', 5, 10) }]]),
|
||||
failureCategoryTotals: { framework_issue: 0 },
|
||||
trialTotal: 290,
|
||||
};
|
||||
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
|
||||
expect(md).toMatch(/#### Failure breakdown/);
|
||||
expect(md).toMatch(/`framework_issue` 🆕/);
|
||||
expect(md).toMatch(/\*\*notable\*\*/);
|
||||
});
|
||||
|
||||
it('always includes all five tier counts in the alert line', () => {
|
||||
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
|
||||
const base = bucket('master', [s('a', 'happy', 8, 10)]);
|
||||
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
|
||||
expect(md).toMatch(/0 regressions, 0 soft, 0 notable, 0 improvements, 1 stable/);
|
||||
});
|
||||
|
||||
it('renders a per-scenario breakdown collapsible inside the regression section', () => {
|
||||
const evalWithFailures = evaluation({
|
||||
totalRuns: 3,
|
||||
testCases: [
|
||||
{
|
||||
prompt: 'a',
|
||||
scenarios: [
|
||||
{
|
||||
name: 'happy',
|
||||
passCount: 0,
|
||||
passes: [false, false, false],
|
||||
reasoning: 'Builder produced an unsupported node configuration',
|
||||
failureCategory: 'builder_issue',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
|
||||
const base = bucket('master', [s('a', 'happy', 10, 10)]);
|
||||
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)), {
|
||||
slugByTestCase: slugMap(evalWithFailures, ['a']),
|
||||
});
|
||||
|
||||
expect(md).toMatch(/#### Regressions \(1\)/);
|
||||
// The regression row's collapsible should appear inside the Regressions
|
||||
// section, before the per-test-case section, and carry the same slug.
|
||||
const regressionsIdx = md.indexOf('#### Regressions');
|
||||
const perTcIdx = md.indexOf('Per-test-case results');
|
||||
const breakdownIdx = md.indexOf('<code>a/happy</code>');
|
||||
expect(breakdownIdx).toBeGreaterThan(regressionsIdx);
|
||||
expect(breakdownIdx).toBeLessThan(perTcIdx);
|
||||
expect(md).toMatch(/3 of 3 failed · 3× builder_issue/);
|
||||
expect(md).toMatch(/Run 1 \[builder_issue\]: Builder produced/);
|
||||
});
|
||||
|
||||
it('uses `file/scenario` slug headers in the bottom Failure details section', () => {
|
||||
const evalWithFailures = evaluation({
|
||||
totalRuns: 3,
|
||||
testCases: [
|
||||
{
|
||||
prompt: 'Build a cross-team Linear report digest',
|
||||
scenarios: [
|
||||
{
|
||||
name: 'no-cross-team-issues',
|
||||
passCount: 0,
|
||||
passes: [false, false, false],
|
||||
reasoning: 'reason',
|
||||
failureCategory: 'builder_issue',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
const pr = bucket('pr', [s('cross-team-linear-report', 'no-cross-team-issues', 0, 3)]);
|
||||
const base = bucket('master', [s('cross-team-linear-report', 'no-cross-team-issues', 10, 10)]);
|
||||
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)), {
|
||||
slugByTestCase: slugMap(evalWithFailures, ['cross-team-linear-report']),
|
||||
});
|
||||
|
||||
expect(md).toMatch(/<summary>Failure details<\/summary>/);
|
||||
expect(md).toMatch(/\*\*`cross-team-linear-report\/no-cross-team-issues`\*\* — 3 failed/);
|
||||
});
|
||||
|
||||
it('attaches per-scenario failures to the right file slug when names collide', () => {
|
||||
// Two test cases each defining `happy-path`. Without the slug map,
|
||||
// the renderer would conflate them — Albert's review flagged this
|
||||
// exact bug. With the map, each row's collapsible carries only that
|
||||
// row's failures.
|
||||
const evalWithFailures = evaluation({
|
||||
totalRuns: 3,
|
||||
testCases: [
|
||||
{
|
||||
prompt: 'cross-team prompt',
|
||||
scenarios: [
|
||||
{
|
||||
name: 'happy-path',
|
||||
passCount: 0,
|
||||
passes: [false, false, false],
|
||||
reasoning: 'Linear node misconfigured',
|
||||
failureCategory: 'builder_issue',
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
prompt: 'weather prompt',
|
||||
scenarios: [
|
||||
{
|
||||
name: 'happy-path',
|
||||
passCount: 0,
|
||||
passes: [false, false, false],
|
||||
reasoning: 'Weather mock returned empty',
|
||||
failureCategory: 'mock_issue',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
const pr = bucket('pr', [
|
||||
s('cross-team-linear-report', 'happy-path', 0, 3),
|
||||
s('weather-monitoring', 'happy-path', 0, 3),
|
||||
]);
|
||||
const base = bucket('master', [
|
||||
s('cross-team-linear-report', 'happy-path', 10, 10),
|
||||
s('weather-monitoring', 'happy-path', 10, 10),
|
||||
]);
|
||||
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)), {
|
||||
slugByTestCase: slugMap(evalWithFailures, ['cross-team-linear-report', 'weather-monitoring']),
|
||||
});
|
||||
|
||||
// Each per-scenario collapsible (under the regression table) must show
|
||||
// ONLY its own failures. Slice each block at its closing </details>.
|
||||
function collapsibleFor(slug: string): string {
|
||||
const open = md.indexOf(`<code>${slug}</code>`);
|
||||
expect(open).toBeGreaterThan(-1);
|
||||
const close = md.indexOf('</details>', open);
|
||||
return md.slice(open, close);
|
||||
}
|
||||
const crossTeamBlock = collapsibleFor('cross-team-linear-report/happy-path');
|
||||
const weatherBlock = collapsibleFor('weather-monitoring/happy-path');
|
||||
expect(crossTeamBlock).toMatch(/Linear node misconfigured/);
|
||||
expect(crossTeamBlock).not.toMatch(/Weather mock returned empty/);
|
||||
expect(weatherBlock).toMatch(/Weather mock returned empty/);
|
||||
expect(weatherBlock).not.toMatch(/Linear node misconfigured/);
|
||||
});
|
||||
|
||||
it('uses the slug instead of the prompt in the per-test-case table', () => {
|
||||
const evalFx = evaluation({
|
||||
totalRuns: 3,
|
||||
testCases: [
|
||||
{
|
||||
prompt: 'Build a cross-team Linear report digest from open issues',
|
||||
scenarios: [{ name: 'happy', passCount: 0, passes: [false, false, false] }],
|
||||
},
|
||||
],
|
||||
});
|
||||
const pr = bucket('pr', [s('cross-team-linear-report', 'happy', 0, 3)]);
|
||||
const base = bucket('master', [s('cross-team-linear-report', 'happy', 10, 10)]);
|
||||
const md = formatComparisonMarkdown(evalFx, ok(compareBuckets(pr, base)), {
|
||||
slugByTestCase: slugMap(evalFx, ['cross-team-linear-report']),
|
||||
});
|
||||
|
||||
// Per-test-case table cell should be the slug, not the prompt.
|
||||
const perTcSection = md.slice(md.indexOf('Per-test-case results'));
|
||||
expect(perTcSection).toMatch(/`cross-team-linear-report`/);
|
||||
expect(perTcSection).not.toMatch(/Build a cross-team Linear report digest/);
|
||||
});
|
||||
|
||||
it('skips per-scenario breakdown when slugByTestCase is omitted', () => {
|
||||
// Without the slug map, the renderer can't disambiguate. We'd rather
|
||||
// drop the breakdown than show a wrong one.
|
||||
const evalWithFailures = evaluation({
|
||||
totalRuns: 3,
|
||||
testCases: [
|
||||
{
|
||||
prompt: 'a',
|
||||
scenarios: [
|
||||
{
|
||||
name: 'happy',
|
||||
passCount: 0,
|
||||
passes: [false, false, false],
|
||||
reasoning: 'Some failure',
|
||||
failureCategory: 'builder_issue',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
|
||||
const base = bucket('master', [s('a', 'happy', 10, 10)]);
|
||||
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)));
|
||||
|
||||
// Regression table still rendered.
|
||||
expect(md).toMatch(/#### Regressions \(1\)/);
|
||||
// But no per-scenario collapsible (which would have used <code>a/happy</code>
|
||||
// with the breakdown summary text).
|
||||
expect(md).not.toMatch(/3 of 3 failed · 3× builder_issue/);
|
||||
});
|
||||
|
||||
it('renders the failure breakdown for non-notable categories with non-zero counts', () => {
|
||||
// 50/100 vs 50/100 — no scenario regression, but still has builder_issue
|
||||
// counts on both sides (non-notable but non-zero).
|
||||
const pr: ExperimentBucket = {
|
||||
experimentName: 'pr',
|
||||
scenarios: new Map([['a/happy', { ...s('a', 'happy', 50, 100) }]]),
|
||||
failureCategoryTotals: { builder_issue: 25 },
|
||||
trialTotal: 100,
|
||||
};
|
||||
const base: ExperimentBucket = {
|
||||
experimentName: 'master',
|
||||
scenarios: new Map([['a/happy', { ...s('a', 'happy', 50, 100) }]]),
|
||||
failureCategoryTotals: { builder_issue: 22 },
|
||||
trialTotal: 100,
|
||||
};
|
||||
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
|
||||
expect(md).toMatch(/#### Failure breakdown/);
|
||||
expect(md).toMatch(/`builder_issue`/);
|
||||
// builder_issue isn't notable here, so no "notable" marker.
|
||||
expect(md).not.toMatch(/builder_issue.*notable/);
|
||||
});
|
||||
});
|
||||
|
||||
describe('formatComparisonTerminal', () => {
|
||||
const evalFixture = evaluation({
|
||||
totalRuns: 3,
|
||||
testCases: [
|
||||
{
|
||||
prompt: 'a',
|
||||
scenarios: [{ name: 'happy', passCount: 0, passes: [false, false, false] }],
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
it('renders title, verdict, aggregate, and regression table without markdown syntax', () => {
|
||||
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
|
||||
const base = bucket('master-abc', [s('a', 'happy', 10, 10)]);
|
||||
const out = formatComparisonTerminal(evalFixture, ok(compareBuckets(pr, base)));
|
||||
expect(out).toMatch(/^Instance AI Workflow Eval/);
|
||||
expect(out).toMatch(/▶ 1 regression/);
|
||||
expect(out).toMatch(/PR\s{8}0\.0%/);
|
||||
expect(out).toMatch(/baseline\s{2}100\.0%/);
|
||||
expect(out).toMatch(/REGRESSIONS/);
|
||||
expect(out).toMatch(/a\/happy/);
|
||||
expect(out).not.toMatch(/^###/m);
|
||||
expect(out).not.toMatch(/\| /);
|
||||
});
|
||||
|
||||
it('renders LangSmith-disabled message when outcome is undefined', () => {
|
||||
const out = formatComparisonTerminal(evalFixture);
|
||||
expect(out).toMatch(/LangSmith disabled/);
|
||||
expect(out).not.toMatch(/REGRESSIONS/);
|
||||
});
|
||||
|
||||
it('shows partial banner when scenarios differ on each side', () => {
|
||||
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
|
||||
const base = bucket('master', [s('a', 'happy', 8, 10), s('b', 'happy', 5, 10)]);
|
||||
const out = formatComparisonTerminal(evalFixture, ok(compareBuckets(pr, base)));
|
||||
expect(out).toMatch(/partial: 1 baseline scenarios not run by PR/);
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,161 @@
|
|||
import {
|
||||
classifyScenario,
|
||||
fishersExactOneSidedLeft,
|
||||
wilsonInterval,
|
||||
} from '../comparison/statistics';
|
||||
|
||||
describe('fishersExactOneSidedLeft', () => {
|
||||
it('returns 1 when either row is empty (no information)', () => {
|
||||
expect(fishersExactOneSidedLeft(0, 0, 5, 5)).toBe(1);
|
||||
expect(fishersExactOneSidedLeft(5, 5, 0, 0)).toBe(1);
|
||||
});
|
||||
|
||||
it('returns 1 when no failures or no passes are observed (no test possible)', () => {
|
||||
expect(fishersExactOneSidedLeft(3, 0, 5, 0)).toBe(1);
|
||||
expect(fishersExactOneSidedLeft(0, 3, 0, 5)).toBe(1);
|
||||
});
|
||||
|
||||
it('matches a known textbook case', () => {
|
||||
// 2x2 table where PR (1/3) is much worse than baseline (10/10).
|
||||
// Hypergeometric: P(X = 0) + P(X = 1) | drawn=3 from passes=11, fails=2
|
||||
// = C(11,0)C(2,3)/C(13,3) + C(11,1)C(2,2)/C(13,3)
|
||||
// = 0 + 11/286 ≈ 0.03846
|
||||
const p = fishersExactOneSidedLeft(1, 2, 10, 0);
|
||||
expect(p).toBeCloseTo(0.03846, 4);
|
||||
});
|
||||
|
||||
it('returns p = 1 when PR pass rate equals baseline at maximum', () => {
|
||||
// PR all pass, baseline all pass — under H0 the observed PR is the most likely outcome,
|
||||
// so the left-tail (X ≤ a) p-value is exactly 1.
|
||||
const p = fishersExactOneSidedLeft(5, 0, 5, 0);
|
||||
expect(p).toBe(1);
|
||||
});
|
||||
|
||||
it('detects a strong regression with high N', () => {
|
||||
// PR 0/10, baseline 10/10 — extremely strong evidence PR is worse.
|
||||
const p = fishersExactOneSidedLeft(0, 10, 10, 0);
|
||||
expect(p).toBeLessThan(0.001);
|
||||
});
|
||||
|
||||
it('returns 1 when PR matches baseline rates exactly', () => {
|
||||
// PR 5/10, baseline 5/10 — left tail at the median is around 0.5 + symmetric mass
|
||||
// at the observed value, but should be > 0.5 (we're at the center of the distribution).
|
||||
const p = fishersExactOneSidedLeft(5, 5, 5, 5);
|
||||
expect(p).toBeGreaterThan(0.5);
|
||||
});
|
||||
});
|
||||
|
||||
describe('wilsonInterval', () => {
|
||||
it('returns [0, 1] for total=0', () => {
|
||||
expect(wilsonInterval(0, 0)).toEqual({ lower: 0, upper: 1 });
|
||||
});
|
||||
|
||||
it('produces reasonable bounds for 5/10', () => {
|
||||
const ci = wilsonInterval(5, 10);
|
||||
// Known Wilson 95% CI for 5/10: roughly [0.237, 0.763]
|
||||
expect(ci.lower).toBeCloseTo(0.237, 2);
|
||||
expect(ci.upper).toBeCloseTo(0.763, 2);
|
||||
});
|
||||
|
||||
it('produces tight bounds for 0/100', () => {
|
||||
const ci = wilsonInterval(0, 100);
|
||||
expect(ci.lower).toBe(0);
|
||||
expect(ci.upper).toBeLessThan(0.05);
|
||||
});
|
||||
|
||||
it('produces tight bounds for 100/100', () => {
|
||||
const ci = wilsonInterval(100, 100);
|
||||
// upper analytically equals 1 but lands slightly under it after FP rounding —
|
||||
// any reasonable CI for 100/100 should still be tight to the top of the range.
|
||||
expect(ci.upper).toBeGreaterThanOrEqual(0.99);
|
||||
expect(ci.lower).toBeGreaterThan(0.95);
|
||||
});
|
||||
|
||||
it('throws when passes > total', () => {
|
||||
expect(() => wilsonInterval(5, 3)).toThrow();
|
||||
});
|
||||
});
|
||||
|
||||
describe('classifyScenario', () => {
|
||||
it('flags a clear regression on a reliable scenario as hard_regression', () => {
|
||||
const result = classifyScenario(0, 10, 10, 10);
|
||||
expect(result.verdict).toBe('hard_regression');
|
||||
expect(result.delta).toBe(-1);
|
||||
});
|
||||
|
||||
it('marks a hard-significant drop on an unreliable baseline as unreliable_baseline', () => {
|
||||
// Baseline 4/10 (40%) — below hard reliable (70%). PR 0/10 is a 40pp drop with
|
||||
// Fisher p < 0.05. We surface it as `unreliable_baseline` rather than flagging.
|
||||
const result = classifyScenario(0, 10, 4, 10);
|
||||
expect(result.verdict).toBe('unreliable_baseline');
|
||||
});
|
||||
|
||||
it('reports stable when the drop is sub-MDE on a flaky baseline', () => {
|
||||
// Baseline 1/10 (flaky), PR 0/10 — only a 10pp drop, below MDE.
|
||||
const result = classifyScenario(0, 10, 1, 10);
|
||||
expect(result.verdict).toBe('stable');
|
||||
});
|
||||
|
||||
it('does not flag a small drop below the soft MDE threshold', () => {
|
||||
// 9/10 vs 10/10 = 10pp drop, below soft MDE (15pp).
|
||||
const result = classifyScenario(9, 10, 10, 10);
|
||||
expect(result.verdict).toBe('stable');
|
||||
});
|
||||
|
||||
it('flags an improvement when PR is significantly better', () => {
|
||||
const result = classifyScenario(10, 10, 0, 10);
|
||||
expect(result.verdict).toBe('improvement');
|
||||
});
|
||||
|
||||
it('flags improvement even on a never-passing baseline', () => {
|
||||
// "Never passes" baseline (0/10) — fix is worth surfacing without the reliability gate.
|
||||
const result = classifyScenario(8, 10, 0, 10);
|
||||
expect(result.verdict).toBe('improvement');
|
||||
});
|
||||
|
||||
it('returns insufficient_data when either side has no trials', () => {
|
||||
expect(classifyScenario(0, 0, 5, 10).verdict).toBe('insufficient_data');
|
||||
expect(classifyScenario(5, 10, 0, 0).verdict).toBe('insufficient_data');
|
||||
});
|
||||
|
||||
it('flags the most extreme outcome at minimum N as hard_regression', () => {
|
||||
// PR 0/3 vs baseline 3/3 — Fisher one-sided p ≈ 0.05, delta = -100pp.
|
||||
const result = classifyScenario(0, 3, 3, 3);
|
||||
expect(result.verdict).toBe('hard_regression');
|
||||
});
|
||||
|
||||
it('reports stable when N is small enough that even a full flip is sub-significant for soft tier', () => {
|
||||
// PR 1/2 vs baseline 2/2 — delta -50pp but Fisher p ≈ 0.5 (way above soft α=0.20).
|
||||
// Soft MDE met, but significance fails on both tiers.
|
||||
const result = classifyScenario(1, 2, 2, 2);
|
||||
expect(['stable', 'watch']).toContain(result.verdict);
|
||||
});
|
||||
|
||||
it('marks soft regression when hard delta is missed but soft thresholds met', () => {
|
||||
// 6/10 vs 10/10 = 40pp drop, p ≈ 0.043, baseline 100% reliable.
|
||||
// Hard defaults would flag this; force a stricter hard delta to push it to soft.
|
||||
const result = classifyScenario(6, 10, 10, 10, {
|
||||
hard: { maxPValue: 0.05, minDelta: 0.5, minBaselinePassRate: 0.7 },
|
||||
soft: { maxPValue: 0.2, minDelta: 0.15, minBaselinePassRate: 0.5 },
|
||||
});
|
||||
expect(result.verdict).toBe('soft_regression');
|
||||
});
|
||||
|
||||
it('marks watch when delta crosses the watch threshold without significance', () => {
|
||||
// 5/10 vs 7/10 = -20pp drop, p ≈ 0.32 — not significant for hard or soft.
|
||||
// Default watchDelta is 0.35, so this should not be `watch`. Force it via
|
||||
// a smaller threshold to validate the path.
|
||||
const result = classifyScenario(5, 10, 7, 10, { watchDelta: 0.15 });
|
||||
expect(result.verdict).toBe('watch');
|
||||
});
|
||||
|
||||
it('respects custom hard-tier delta override', () => {
|
||||
// 7/10 vs 10/10 = 30pp delta. Default hard minDelta is 0.3, so this barely qualifies.
|
||||
// With hard.minDelta 0.4, it drops into `soft_regression` (still passes soft 0.15 minDelta).
|
||||
// p ≈ 0.105 < soft maxPValue (0.2), so soft fires.
|
||||
const result = classifyScenario(7, 10, 10, 10, {
|
||||
hard: { minDelta: 0.4 },
|
||||
});
|
||||
expect(result.verdict).toBe('soft_regression');
|
||||
});
|
||||
});
|
||||
|
|
@ -45,7 +45,7 @@ export interface CliArgs {
|
|||
// ---------------------------------------------------------------------------
|
||||
|
||||
const cliArgsSchema = z.object({
|
||||
timeoutMs: z.number().int().positive().default(600_000),
|
||||
timeoutMs: z.number().int().positive().default(900_000),
|
||||
baseUrls: z.array(z.string().url()).min(1).default(['http://localhost:5678']),
|
||||
email: z.string().optional(),
|
||||
password: z.string().optional(),
|
||||
|
|
@ -104,7 +104,7 @@ interface RawArgs {
|
|||
|
||||
function parseRawArgs(argv: string[]): RawArgs {
|
||||
const result: RawArgs = {
|
||||
timeoutMs: 600_000,
|
||||
timeoutMs: 900_000,
|
||||
baseUrls: ['http://localhost:5678'],
|
||||
verbose: false,
|
||||
keepWorkflows: false,
|
||||
|
|
|
|||
|
|
@ -23,6 +23,15 @@ import { buildCIMetadata, computeExperimentPrefix } from './ci-metadata';
|
|||
import { LaneAllocator } from './lane-allocator';
|
||||
import { expandWithIterations, partitionRoundRobin } from './lanes';
|
||||
import { N8nClient } from '../clients/n8n-client';
|
||||
import {
|
||||
compareBuckets,
|
||||
type ComparisonOutcome,
|
||||
type ComparisonResult,
|
||||
type ExperimentBucket,
|
||||
type ScenarioCounts,
|
||||
} from '../comparison/compare';
|
||||
import { fetchBaselineBucket, findLatestBaseline } from '../comparison/fetch-baseline';
|
||||
import { formatComparisonMarkdown, formatComparisonTerminal } from '../comparison/format';
|
||||
import { seedCredentials, cleanupCredentials } from '../credentials/seeder';
|
||||
import { loadWorkflowTestCasesWithFiles } from '../data/workflows';
|
||||
import type { WorkflowTestCaseWithFile } from '../data/workflows';
|
||||
|
|
@ -43,6 +52,7 @@ import type {
|
|||
MultiRunEvaluation,
|
||||
ScenarioResult,
|
||||
TestScenario,
|
||||
WorkflowTestCase,
|
||||
WorkflowTestCaseResult,
|
||||
} from '../types';
|
||||
|
||||
|
|
@ -160,21 +170,40 @@ async function main(): Promise<void> {
|
|||
const hasLangSmith = Boolean(process.env.LANGSMITH_API_KEY);
|
||||
|
||||
let evaluation: MultiRunEvaluation;
|
||||
let experimentName: string | undefined;
|
||||
let outcome: ComparisonOutcome | undefined;
|
||||
let slugByTestCase: Map<WorkflowTestCase, string> | undefined;
|
||||
|
||||
if (hasLangSmith) {
|
||||
logger.info('LangSmith API key detected, using evaluate() with experiment tracking');
|
||||
evaluation = await runWithLangSmith({ args, lanes, logger });
|
||||
const langsmithRun = await runWithLangSmith({ args, lanes, logger });
|
||||
evaluation = langsmithRun.evaluation;
|
||||
experimentName = langsmithRun.experimentName;
|
||||
outcome = langsmithRun.outcome;
|
||||
slugByTestCase = langsmithRun.slugByTestCase;
|
||||
} else {
|
||||
logger.info('No LANGSMITH_API_KEY, running direct loop (results in eval-results.json only)');
|
||||
evaluation = await runDirectLoop({ args, lanes, logger });
|
||||
}
|
||||
|
||||
const totalDuration = Date.now() - startTime;
|
||||
const outputPath = writeEvalResults(evaluation, totalDuration, args.outputDir);
|
||||
console.log(`Results: ${outputPath}`);
|
||||
const commitSha = process.env.LANGSMITH_REVISION_ID ?? process.env.GITHUB_SHA;
|
||||
const { jsonPath, prCommentPath } = writeEvalResults(
|
||||
evaluation,
|
||||
totalDuration,
|
||||
args.outputDir,
|
||||
experimentName,
|
||||
outcome,
|
||||
commitSha,
|
||||
slugByTestCase,
|
||||
);
|
||||
console.log(`Results: ${jsonPath}`);
|
||||
console.log(`PR comment: ${prCommentPath}`);
|
||||
const htmlPath = writeWorkflowReport(flattenRunsForReport(evaluation));
|
||||
console.log(`Report: ${htmlPath}`);
|
||||
printSummary(evaluation);
|
||||
console.log(`Report: ${htmlPath}`);
|
||||
console.log(
|
||||
'\n' + formatComparisonTerminal(evaluation, outcome, { commitSha, slugByTestCase }),
|
||||
);
|
||||
} finally {
|
||||
await Promise.all(
|
||||
lanes.map(async (lane) => {
|
||||
|
|
@ -188,7 +217,12 @@ async function main(): Promise<void> {
|
|||
// LangSmith mode: evaluate() with dataset sync, tracing, experiments
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
async function runWithLangSmith(config: RunConfig): Promise<MultiRunEvaluation> {
|
||||
async function runWithLangSmith(config: RunConfig): Promise<{
|
||||
evaluation: MultiRunEvaluation;
|
||||
experimentName: string;
|
||||
outcome: ComparisonOutcome;
|
||||
slugByTestCase: Map<WorkflowTestCase, string>;
|
||||
}> {
|
||||
const { args, lanes, logger } = config;
|
||||
|
||||
const lsClient = new Client();
|
||||
|
|
@ -466,7 +500,24 @@ async function runWithLangSmith(config: RunConfig): Promise<MultiRunEvaluation>
|
|||
logger,
|
||||
});
|
||||
|
||||
return evaluation;
|
||||
const outcome = await tryRunComparison({
|
||||
lsClient,
|
||||
prExperimentName: experimentResults.experimentName,
|
||||
evaluation,
|
||||
testCasesWithFiles,
|
||||
logger,
|
||||
});
|
||||
|
||||
const slugByTestCase = new Map<WorkflowTestCase, string>(
|
||||
testCasesWithFiles.map(({ testCase, fileSlug }) => [testCase, fileSlug]),
|
||||
);
|
||||
|
||||
return {
|
||||
evaluation,
|
||||
experimentName: experimentResults.experimentName,
|
||||
outcome,
|
||||
slugByTestCase,
|
||||
};
|
||||
} finally {
|
||||
if (!args.keepWorkflows) {
|
||||
await Promise.all(
|
||||
|
|
@ -826,15 +877,22 @@ function computePassRatePerIter(evaluation: MultiRunEvaluation): string {
|
|||
function writeEvalResults(
|
||||
evaluation: MultiRunEvaluation,
|
||||
duration: number,
|
||||
outputDir?: string,
|
||||
): string {
|
||||
outputDir: string | undefined,
|
||||
experimentName: string | undefined,
|
||||
outcome: ComparisonOutcome | undefined,
|
||||
commitSha: string | undefined,
|
||||
slugByTestCase: Map<WorkflowTestCase, string> | undefined,
|
||||
): { jsonPath: string; prCommentPath: string } {
|
||||
const { totalRuns, testCases } = evaluation;
|
||||
const metrics = computeAggregateMetrics(evaluation);
|
||||
|
||||
const result = outcome?.kind === 'ok' ? outcome.result : undefined;
|
||||
|
||||
const report = {
|
||||
timestamp: new Date().toISOString(),
|
||||
duration,
|
||||
totalRuns,
|
||||
experimentName,
|
||||
summary: {
|
||||
testCases: testCases.length,
|
||||
built: metrics.built,
|
||||
|
|
@ -843,6 +901,19 @@ function writeEvalResults(
|
|||
passHatK: metrics.passHatK,
|
||||
passRatePerIter: metrics.passRatePerIter,
|
||||
},
|
||||
// Structured comparison payload only — the rendered markdown lives in
|
||||
// the sibling `eval-pr-comment.md` file so consumers can pick the format
|
||||
// they want without re-running the eval. `comparisonStatus` records why
|
||||
// the comparison was skipped when applicable, so JSON consumers can
|
||||
// distinguish "no baseline yet" from "regression detection broke".
|
||||
comparison: result
|
||||
? {
|
||||
baseline: result.baseline.experimentName,
|
||||
result: serializeComparison(result),
|
||||
}
|
||||
: undefined,
|
||||
comparisonStatus: outcome?.kind ?? 'not_attempted',
|
||||
comparisonError: outcome?.kind === 'fetch_failed' ? outcome.error : undefined,
|
||||
testCases: testCases.map((tc) => ({
|
||||
name: tc.testCase.prompt.slice(0, 70),
|
||||
buildSuccessCount: tc.buildSuccessCount,
|
||||
|
|
@ -868,74 +939,137 @@ function writeEvalResults(
|
|||
|
||||
const targetDir = outputDir ?? process.cwd();
|
||||
mkdirSync(targetDir, { recursive: true });
|
||||
const outputPath = join(targetDir, 'eval-results.json');
|
||||
writeFileSync(outputPath, JSON.stringify(report, null, 2));
|
||||
return outputPath;
|
||||
const jsonPath = join(targetDir, 'eval-results.json');
|
||||
writeFileSync(jsonPath, JSON.stringify(report, null, 2));
|
||||
|
||||
// Always write the rendered PR comment — the markdown formatter handles
|
||||
// both with-comparison and no-baseline cases. CI consumes this file
|
||||
// directly; local users get a copy-pasteable artifact.
|
||||
const prCommentPath = join(targetDir, 'eval-pr-comment.md');
|
||||
writeFileSync(
|
||||
prCommentPath,
|
||||
formatComparisonMarkdown(evaluation, outcome, { commitSha, slugByTestCase }),
|
||||
);
|
||||
|
||||
return { jsonPath, prCommentPath };
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert ComparisonResult into a JSON-serializable shape (Maps don't survive
|
||||
* JSON.stringify by default).
|
||||
*/
|
||||
function serializeComparison(result: ComparisonResult): {
|
||||
pr: { experimentName: string };
|
||||
baseline: { experimentName: string };
|
||||
aggregate: ComparisonResult['aggregate'];
|
||||
scenarios: ComparisonResult['scenarios'];
|
||||
prOnly: ComparisonResult['prOnly'];
|
||||
baselineOnly: ComparisonResult['baselineOnly'];
|
||||
failureCategories: ComparisonResult['failureCategories'];
|
||||
} {
|
||||
return {
|
||||
pr: result.pr,
|
||||
baseline: result.baseline,
|
||||
aggregate: result.aggregate,
|
||||
scenarios: result.scenarios,
|
||||
prOnly: result.prOnly,
|
||||
baselineOnly: result.baselineOnly,
|
||||
failureCategories: result.failureCategories,
|
||||
};
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Console summary
|
||||
// Comparison vs the pinned baseline experiment
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function printSummary(evaluation: MultiRunEvaluation): void {
|
||||
const { totalRuns, testCases } = evaluation;
|
||||
const multiRun = totalRuns > 1;
|
||||
const metrics = computeAggregateMetrics(evaluation);
|
||||
/**
|
||||
* Best-effort comparison. Returns a tagged outcome so the PR comment can
|
||||
* distinguish "no baseline yet" / "this run IS the baseline" from a real
|
||||
* regression-detection outage (LangSmith down, fetch failure). Never throws
|
||||
* — the eval run is not gated on the comparison.
|
||||
*/
|
||||
async function tryRunComparison(config: {
|
||||
lsClient: Client;
|
||||
prExperimentName: string;
|
||||
evaluation: MultiRunEvaluation;
|
||||
testCasesWithFiles: WorkflowTestCaseWithFile[];
|
||||
logger: EvalLogger;
|
||||
}): Promise<ComparisonOutcome> {
|
||||
const { lsClient, prExperimentName, evaluation, testCasesWithFiles, logger } = config;
|
||||
|
||||
console.log('\n=== Workflow Eval Results ===\n');
|
||||
for (const tc of testCases) {
|
||||
console.log(`${tc.testCase.prompt.slice(0, 70)}...`);
|
||||
|
||||
if (multiRun) {
|
||||
console.log(` Build: ${String(tc.buildSuccessCount)}/${String(totalRuns)} runs`);
|
||||
} else {
|
||||
const r = tc.runs[0];
|
||||
const buildStatus = r.workflowBuildSuccess ? 'BUILT' : 'BUILD FAILED';
|
||||
console.log(` Workflow: ${buildStatus}${r.workflowId ? ` (${r.workflowId})` : ''}`);
|
||||
if (r.buildError) {
|
||||
console.log(` Error: ${r.buildError.slice(0, 200)}`);
|
||||
}
|
||||
try {
|
||||
const baselineName = await findLatestBaseline(lsClient);
|
||||
if (!baselineName) {
|
||||
logger.verbose(
|
||||
'No baseline experiment found — skipping comparison. ' +
|
||||
'Run with --experiment-name instance-ai-baseline to create one.',
|
||||
);
|
||||
return { kind: 'no_baseline' };
|
||||
}
|
||||
if (baselineName === prExperimentName) {
|
||||
logger.verbose('Current run is the baseline — skipping comparison.');
|
||||
return { kind: 'self_baseline', experimentName: baselineName };
|
||||
}
|
||||
|
||||
logger.info(`Comparing against baseline: ${baselineName}`);
|
||||
const baseline = await fetchBaselineBucket(lsClient, baselineName);
|
||||
const pr = bucketFromEvaluation(evaluation, testCasesWithFiles, prExperimentName);
|
||||
return { kind: 'ok', result: compareBuckets(pr, baseline) };
|
||||
} catch (error: unknown) {
|
||||
const msg = error instanceof Error ? error.message : String(error);
|
||||
logger.warn(`Comparison vs baseline failed: ${msg}`);
|
||||
return { kind: 'fetch_failed', error: msg };
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Project the in-memory MultiRunEvaluation onto the bucket shape used by
|
||||
* fetchBaselineBucket, keyed by `${fileSlug}/${scenarioName}`.
|
||||
*
|
||||
* Looks up `fileSlug` by test case reference rather than array index — the
|
||||
* comparison key depends on getting the right slug, and zipping by index
|
||||
* silently miscompares if anything ever reorders the aggregate.
|
||||
*/
|
||||
function bucketFromEvaluation(
|
||||
evaluation: MultiRunEvaluation,
|
||||
testCasesWithFiles: WorkflowTestCaseWithFile[],
|
||||
experimentName: string,
|
||||
): ExperimentBucket {
|
||||
const slugByTestCase = new Map(
|
||||
testCasesWithFiles.map(({ testCase, fileSlug }) => [testCase, fileSlug]),
|
||||
);
|
||||
const scenarios = new Map<string, ScenarioCounts>();
|
||||
const failureCategoryTotals: Record<string, number> = {};
|
||||
let trialTotal = 0;
|
||||
for (const tc of evaluation.testCases) {
|
||||
const fileSlug = slugByTestCase.get(tc.testCase);
|
||||
if (!fileSlug) {
|
||||
throw new Error(
|
||||
`bucketFromEvaluation: no fileSlug for test case "${tc.testCase.prompt.slice(0, 60)}"`,
|
||||
);
|
||||
}
|
||||
const total = tc.runs.length;
|
||||
for (const sa of tc.scenarios) {
|
||||
if (multiRun) {
|
||||
const passAtK = Math.round((sa.passAtK[metrics.kIndex] ?? 0) * 100);
|
||||
const passHatK = Math.round((sa.passHatK[metrics.kIndex] ?? 0) * 100);
|
||||
console.log(
|
||||
` ${sa.scenario.name}: ${String(sa.passCount)}/${String(totalRuns)} passed` +
|
||||
` | pass@${String(totalRuns)}: ${String(passAtK)}% | pass^${String(totalRuns)}: ${String(passHatK)}%`,
|
||||
);
|
||||
} else {
|
||||
const sr = sa.runs[0];
|
||||
const icon = sr.success ? '✓' : '✗';
|
||||
const category = sr.failureCategory ? ` [${sr.failureCategory}]` : '';
|
||||
console.log(
|
||||
` ${icon} ${sr.scenario.name}: ${sr.success ? 'PASS' : 'FAIL'}${category} (${String(sr.score * 100)}%)`,
|
||||
);
|
||||
if (!sr.success) {
|
||||
const execErrors = sr.evalResult?.errors ?? [];
|
||||
if (execErrors.length > 0) {
|
||||
console.log(` Error: ${execErrors.join('; ').slice(0, 200)}`);
|
||||
}
|
||||
console.log(` Diagnosis: ${sr.reasoning.slice(0, 200)}`);
|
||||
const key = `${fileSlug}/${sa.scenario.name}`;
|
||||
const failureCategories: Record<string, number> = {};
|
||||
for (const sr of sa.runs) {
|
||||
trialTotal++;
|
||||
if (!sr.success && sr.failureCategory) {
|
||||
failureCategories[sr.failureCategory] = (failureCategories[sr.failureCategory] ?? 0) + 1;
|
||||
failureCategoryTotals[sr.failureCategory] =
|
||||
(failureCategoryTotals[sr.failureCategory] ?? 0) + 1;
|
||||
}
|
||||
}
|
||||
scenarios.set(key, {
|
||||
testCaseFile: fileSlug,
|
||||
scenarioName: sa.scenario.name,
|
||||
passed: sa.passCount,
|
||||
total,
|
||||
failureCategories,
|
||||
});
|
||||
}
|
||||
console.log('');
|
||||
}
|
||||
|
||||
if (multiRun) {
|
||||
console.log(
|
||||
`${String(metrics.built)}/${String(testCases.length)} built | pass@${String(totalRuns)}: ${String(Math.round(metrics.passAtK * 100))}% | pass^${String(totalRuns)}: ${String(Math.round(metrics.passHatK * 100))}% | iterations: ${metrics.passRatePerIter}`,
|
||||
);
|
||||
} else {
|
||||
const allScenarios = testCases.flatMap((tc) => tc.scenarios);
|
||||
const passed = allScenarios.filter((s) => s.runs[0]?.success).length;
|
||||
const total = metrics.scenariosTotal;
|
||||
console.log(
|
||||
`${String(metrics.built)}/${String(testCases.length)} built | ${String(passed)}/${String(total)} passed (${String(total > 0 ? Math.round((passed / total) * 100) : 0)}%)`,
|
||||
);
|
||||
}
|
||||
return { experimentName, scenarios, failureCategoryTotals, trialTotal };
|
||||
}
|
||||
|
||||
main().catch((error) => {
|
||||
|
|
|
|||
333
packages/@n8n/instance-ai/evaluations/comparison/compare.ts
Normal file
333
packages/@n8n/instance-ai/evaluations/comparison/compare.ts
Normal file
|
|
@ -0,0 +1,333 @@
|
|||
// ---------------------------------------------------------------------------
|
||||
// Comparison core: take two experiment buckets, return a ComparisonResult.
|
||||
//
|
||||
// Pure function, no I/O. The tier thresholds (p-value cutoff, minimum delta,
|
||||
// minimum baseline pass rate) live in statistics.ts — there's no CLI knob.
|
||||
// Tune them there if the false-positive rate drifts.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
import {
|
||||
classifyScenario,
|
||||
wilsonInterval,
|
||||
type ClassifyOptions,
|
||||
type ScenarioClassification,
|
||||
type ScenarioVerdict,
|
||||
} from './statistics';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export interface ScenarioCounts {
|
||||
testCaseFile: string;
|
||||
scenarioName: string;
|
||||
passed: number;
|
||||
total: number;
|
||||
failureCategories?: Record<string, number>;
|
||||
}
|
||||
|
||||
export interface ExperimentBucket {
|
||||
experimentName: string;
|
||||
scenarios: Map<string, ScenarioCounts>;
|
||||
/**
|
||||
* Aggregated failure-category counts across all trials in all scenarios.
|
||||
* Used for the run-level failure-category drift table — orthogonal to
|
||||
* per-scenario verdicts.
|
||||
*/
|
||||
failureCategoryTotals?: Record<string, number>;
|
||||
trialTotal?: number;
|
||||
}
|
||||
|
||||
export interface ScenarioComparison extends ScenarioClassification {
|
||||
testCaseFile: string;
|
||||
scenarioName: string;
|
||||
prPasses: number;
|
||||
prTotal: number;
|
||||
baselinePasses: number;
|
||||
baselineTotal: number;
|
||||
}
|
||||
|
||||
export interface AggregateComparison {
|
||||
intersectionSize: number;
|
||||
prAggregatePassRate: number;
|
||||
baselineAggregatePassRate: number;
|
||||
prAggregateCI: { lower: number; upper: number };
|
||||
baselineAggregateCI: { lower: number; upper: number };
|
||||
delta: number;
|
||||
}
|
||||
|
||||
export interface FailureCategoryComparison {
|
||||
category: string;
|
||||
prCount: number;
|
||||
prRate: number; // count / trialTotal
|
||||
baselineCount: number;
|
||||
baselineRate: number;
|
||||
delta: number; // prRate − baselineRate
|
||||
notable: boolean;
|
||||
}
|
||||
|
||||
export interface ComparisonResult {
|
||||
pr: { experimentName: string };
|
||||
baseline: { experimentName: string };
|
||||
aggregate: AggregateComparison;
|
||||
scenarios: ScenarioComparison[];
|
||||
prOnly: Array<{ testCaseFile: string; scenarioName: string }>;
|
||||
baselineOnly: Array<{ testCaseFile: string; scenarioName: string }>;
|
||||
failureCategories: FailureCategoryComparison[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Result of a comparison attempt. The `kind` field distinguishes between
|
||||
* "ran successfully", "skipped intentionally" (no baseline yet, current run
|
||||
* IS the baseline), and "failed unexpectedly" (LangSmith API error, fetch
|
||||
* timeout, etc.). The PR comment renders a different alert per kind so
|
||||
* readers can tell a missing baseline from a regression-detection outage.
|
||||
*/
|
||||
export type ComparisonOutcome =
|
||||
| { kind: 'ok'; result: ComparisonResult }
|
||||
| { kind: 'no_baseline' }
|
||||
| { kind: 'self_baseline'; experimentName: string }
|
||||
| { kind: 'fetch_failed'; error: string };
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Hard regressions only — high-confidence, gating-grade flags. */
|
||||
export function hardRegressions(result: ComparisonResult): ScenarioComparison[] {
|
||||
return result.scenarios.filter((s) => s.verdict === 'hard_regression');
|
||||
}
|
||||
|
||||
/** Soft regressions — looser thresholds, worth investigating but not gating. */
|
||||
export function softRegressions(result: ComparisonResult): ScenarioComparison[] {
|
||||
return result.scenarios.filter((s) => s.verdict === 'soft_regression');
|
||||
}
|
||||
|
||||
/** Movement ≥ watchDelta without reaching a flag tier. Visibility only. */
|
||||
export function watchList(result: ComparisonResult): ScenarioComparison[] {
|
||||
return result.scenarios.filter((s) => s.verdict === 'watch');
|
||||
}
|
||||
|
||||
export function improvements(result: ComparisonResult): ScenarioComparison[] {
|
||||
return result.scenarios.filter((s) => s.verdict === 'improvement');
|
||||
}
|
||||
|
||||
export function byVerdict(result: ComparisonResult): Record<ScenarioVerdict, number> {
|
||||
const counts: Record<ScenarioVerdict, number> = {
|
||||
hard_regression: 0,
|
||||
soft_regression: 0,
|
||||
watch: 0,
|
||||
improvement: 0,
|
||||
stable: 0,
|
||||
unreliable_baseline: 0,
|
||||
insufficient_data: 0,
|
||||
};
|
||||
for (const s of result.scenarios) counts[s.verdict]++;
|
||||
return counts;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Compare
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Compare two experiment buckets and produce a structured comparison result.
 *
 * Aggregate is computed over the *intersection* of scenarios — the only
 * scenarios for which the rates are directly comparable. PR-only and
 * baseline-only scenarios are surfaced separately, not folded into the
 * aggregate.
 *
 * Aggregate pass rate is the *micro* average — total passes / total trials
 * across the intersection.
 *
 * `options` exists for tests; production callers pass nothing.
 */
export function compareBuckets(
	pr: ExperimentBucket,
	baseline: ExperimentBucket,
	options: ClassifyOptions = {},
): ComparisonResult {
	const scenarios: ScenarioComparison[] = [];
	const prOnly: Array<{ testCaseFile: string; scenarioName: string }> = [];
	const baselineOnly: Array<{ testCaseFile: string; scenarioName: string }> = [];

	// Micro-average accumulators over the intersection only ("I" = intersection).
	let prIPasses = 0;
	let prITotal = 0;
	let baseIPasses = 0;
	let baseITotal = 0;

	// First pass: walk PR scenarios; anything without a baseline counterpart
	// goes to prOnly and is excluded from the aggregate accumulators.
	for (const [key, prCounts] of pr.scenarios) {
		const baseCounts = baseline.scenarios.get(key);
		if (!baseCounts) {
			prOnly.push({
				testCaseFile: prCounts.testCaseFile,
				scenarioName: prCounts.scenarioName,
			});
			continue;
		}

		prIPasses += prCounts.passed;
		prITotal += prCounts.total;
		baseIPasses += baseCounts.passed;
		baseITotal += baseCounts.total;

		// Verdict + p-values come from classifyScenario; its fields are spread
		// directly into the ScenarioComparison row.
		const classification = classifyScenario(
			prCounts.passed,
			prCounts.total,
			baseCounts.passed,
			baseCounts.total,
			options,
		);
		scenarios.push({
			testCaseFile: prCounts.testCaseFile,
			scenarioName: prCounts.scenarioName,
			prPasses: prCounts.passed,
			prTotal: prCounts.total,
			baselinePasses: baseCounts.passed,
			baselineTotal: baseCounts.total,
			...classification,
		});
	}

	// Second pass: baseline scenarios the PR run never executed.
	for (const [key, baseCounts] of baseline.scenarios) {
		if (!pr.scenarios.has(key)) {
			baselineOnly.push({
				testCaseFile: baseCounts.testCaseFile,
				scenarioName: baseCounts.scenarioName,
			});
		}
	}

	// delta is recomputed from the same inputs as the two rates, so it is
	// exactly prAggregatePassRate - baselineAggregatePassRate.
	const aggregate: AggregateComparison = {
		intersectionSize: scenarios.length,
		prAggregatePassRate: rate(prIPasses, prITotal),
		baselineAggregatePassRate: rate(baseIPasses, baseITotal),
		prAggregateCI: wilsonInterval(prIPasses, prITotal),
		baselineAggregateCI: wilsonInterval(baseIPasses, baseITotal),
		delta: rate(prIPasses, prITotal) - rate(baseIPasses, baseITotal),
	};

	// Severity-first, then stable path ordering — see scenarioComparator.
	scenarios.sort(scenarioComparator);

	const failureCategories = compareFailureCategories(pr, baseline);

	return {
		pr: { experimentName: pr.experimentName },
		baseline: { experimentName: baseline.experimentName },
		aggregate,
		scenarios,
		prOnly,
		baselineOnly,
		failureCategories,
	};
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Failure-category drift
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Min absolute rate gap to consider a category notable (5 percentage points).
 * Both this gate and the count gate must pass — see isCategoryNotable. */
const CATEGORY_NOTABLE_RATE_DELTA = 0.05;
/** Min absolute trial-count gap (over scaling) required alongside the rate gap. */
const CATEGORY_NOTABLE_COUNT_DELTA = 3;

/**
 * Categories the verifier is supposed to emit. Anything else (malformed
 * strings like `-`, `>builder_issue`, empty, etc.) is dropped from the
 * comparison so the PR comment doesn't display verifier noise. Keep in sync
 * with the verifier's category enum; unknown values are logged at verbose
 * level via the console (see compareFailureCategories).
 */
const KNOWN_FAILURE_CATEGORIES = new Set([
	'builder_issue',
	'mock_issue',
	'framework_issue',
	'verification_failure',
	'build_failure',
]);
|
||||
|
||||
function isCategoryNotable(
|
||||
prCount: number,
|
||||
prTotal: number,
|
||||
baselineCount: number,
|
||||
baselineTotal: number,
|
||||
): boolean {
|
||||
const rateGap = Math.abs(prCount / prTotal - baselineCount / baselineTotal);
|
||||
if (rateGap < CATEGORY_NOTABLE_RATE_DELTA) return false;
|
||||
const expectedPrCount = baselineCount * (prTotal / baselineTotal);
|
||||
const countGap = Math.abs(prCount - expectedPrCount);
|
||||
return countGap >= CATEGORY_NOTABLE_COUNT_DELTA;
|
||||
}
|
||||
|
||||
function compareFailureCategories(
|
||||
pr: ExperimentBucket,
|
||||
baseline: ExperimentBucket,
|
||||
): FailureCategoryComparison[] {
|
||||
if (!pr.failureCategoryTotals || !baseline.failureCategoryTotals) return [];
|
||||
const prTotal = pr.trialTotal ?? 0;
|
||||
const baseTotal = baseline.trialTotal ?? 0;
|
||||
if (prTotal === 0 || baseTotal === 0) return [];
|
||||
|
||||
// Surface unrecognised values so we notice when the verifier adds a new
|
||||
// category (or starts emitting noise we should clean up). Doesn't enter
|
||||
// the comparison output; the renderer only knows about KNOWN_FAILURE_CATEGORIES.
|
||||
for (const category of Object.keys(pr.failureCategoryTotals)) {
|
||||
if (!KNOWN_FAILURE_CATEGORIES.has(category)) {
|
||||
console.warn(`[comparison] dropping unknown failureCategory "${category}"`);
|
||||
}
|
||||
}
|
||||
for (const category of Object.keys(baseline.failureCategoryTotals)) {
|
||||
if (!KNOWN_FAILURE_CATEGORIES.has(category)) {
|
||||
console.warn(`[comparison] dropping unknown failureCategory "${category}"`);
|
||||
}
|
||||
}
|
||||
|
||||
// Always emit a row for every known category, even if both sides are 0.
|
||||
// The renderer can decide whether to suppress 0/0 rows; this gives readers
|
||||
// a complete picture of the failure-type taxonomy by default.
|
||||
const out: FailureCategoryComparison[] = [];
|
||||
for (const category of KNOWN_FAILURE_CATEGORIES) {
|
||||
const prCount = pr.failureCategoryTotals[category] ?? 0;
|
||||
const baselineCount = baseline.failureCategoryTotals[category] ?? 0;
|
||||
out.push({
|
||||
category,
|
||||
prCount,
|
||||
prRate: prCount / prTotal,
|
||||
baselineCount,
|
||||
baselineRate: baselineCount / baseTotal,
|
||||
delta: prCount / prTotal - baselineCount / baseTotal,
|
||||
notable: isCategoryNotable(prCount, prTotal, baselineCount, baseTotal),
|
||||
});
|
||||
}
|
||||
|
||||
// Sort: notable first, then by absolute delta descending.
|
||||
out.sort((a, b) => {
|
||||
if (a.notable !== b.notable) return a.notable ? -1 : 1;
|
||||
return Math.abs(b.delta) - Math.abs(a.delta);
|
||||
});
|
||||
return out;
|
||||
}
|
||||
|
||||
function rate(passes: number, total: number): number {
|
||||
return total > 0 ? passes / total : 0;
|
||||
}
|
||||
|
||||
/**
 * Sort priority for scenario verdicts — lower sorts first. Severity-first:
 * regressions lead, then improvements and watch items, with the
 * informational tiers (unreliable baseline, stable, no data) last.
 * Used by scenarioComparator.
 */
const VERDICT_ORDER: Record<ScenarioComparison['verdict'], number> = {
	hard_regression: 0,
	soft_regression: 1,
	improvement: 2,
	watch: 3,
	unreliable_baseline: 4,
	stable: 5,
	insufficient_data: 6,
};
|
||||
|
||||
function scenarioComparator(a: ScenarioComparison, b: ScenarioComparison): number {
|
||||
const av = VERDICT_ORDER[a.verdict];
|
||||
const bv = VERDICT_ORDER[b.verdict];
|
||||
if (av !== bv) return av - bv;
|
||||
const fileCmp = a.testCaseFile.localeCompare(b.testCaseFile);
|
||||
if (fileCmp !== 0) return fileCmp;
|
||||
return a.scenarioName.localeCompare(b.scenarioName);
|
||||
}
|
||||
|
|
@ -0,0 +1,123 @@
|
|||
// ---------------------------------------------------------------------------
|
||||
// Find and fetch the pinned baseline experiment from LangSmith.
|
||||
//
|
||||
// The baseline is whichever experiment most recently used the
|
||||
// `instance-ai-baseline` prefix. To refresh, run the eval with that prefix:
|
||||
//
|
||||
// pnpm eval:instance-ai --experiment-name instance-ai-baseline --iterations 10
|
||||
//
|
||||
// LangSmith appends a random suffix, so successive baseline runs become
|
||||
// `instance-ai-baseline-7abc1234`, `instance-ai-baseline-9def5678`, etc.
|
||||
// We pick the most recently started one.
|
||||
//
|
||||
// Two functions, both small:
|
||||
//
|
||||
// findLatestBaseline — list baseline-prefixed projects, pick newest.
|
||||
// fetchBaselineBucket — read its root runs, bucket per scenario.
|
||||
//
|
||||
// Both throw on transport errors. Callers are expected to swallow with a log:
|
||||
// the comparison is advisory and shouldn't fail the eval run.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
import type { Client } from 'langsmith';
|
||||
import { z } from 'zod';
|
||||
|
||||
import type { ExperimentBucket, ScenarioCounts } from './compare';
|
||||
|
||||
/**
 * Prefix the latest-baseline lookup matches against. The CLI flag
 * `--experiment-name instance-ai-baseline` produces project names like
 * `instance-ai-baseline-7abc1234` (LangSmith appends a hyphen + suffix), so
 * the constant must end in `-` to avoid matching unrelated names that
 * happen to start with `instance-ai-baseline...`.
 *
 * Used both as the `listProjects({ nameContains })` filter and as the strict
 * `startsWith` re-check inside findLatestBaseline.
 */
export const BASELINE_EXPERIMENT_PREFIX = 'instance-ai-baseline-';
|
||||
|
||||
// Lenient parse of a root run's inputs — only the two routing fields matter;
// `.passthrough()` keeps whatever else the run carries without failing.
// Empty-string defaults are filtered out by the caller (fetchBaselineBucket
// skips runs with a missing testCaseFile or scenarioName).
const inputsSchema = z
	.object({
		testCaseFile: z.string().default(''),
		scenarioName: z.string().default(''),
	})
	.passthrough();

// Outputs of a completed trial. `passed` defaults to false — which is why
// fetchBaselineBucket must pre-filter runs with empty/absent outputs before
// parsing, or incomplete runs would be counted as failures.
const outputsSchema = z
	.object({
		passed: z.boolean().default(false),
		failureCategory: z.string().optional(),
	})
	.passthrough();
|
||||
|
||||
/**
|
||||
* Return the most recently created baseline experiment, or `undefined` if
|
||||
* none exist. We pick by `start_time` so a re-run of an older snapshot
|
||||
* doesn't displace the latest one.
|
||||
*/
|
||||
export async function findLatestBaseline(client: Client): Promise<string | undefined> {
|
||||
let latest: { name: string; ts: number } | undefined;
|
||||
for await (const project of client.listProjects({ nameContains: BASELINE_EXPERIMENT_PREFIX })) {
|
||||
const name = project.name;
|
||||
if (!name?.startsWith(BASELINE_EXPERIMENT_PREFIX)) continue;
|
||||
const ts = project.start_time ? new Date(project.start_time).getTime() : 0;
|
||||
if (!latest || ts > latest.ts) latest = { name, ts };
|
||||
}
|
||||
return latest?.name;
|
||||
}
|
||||
|
||||
/**
 * Fetch a baseline experiment's per-scenario pass/fail counts. Each root run
 * corresponds to one (testCaseFile, scenarioName, iteration) triple — we
 * bucket by `${testCaseFile}/${scenarioName}` and accumulate.
 *
 * Only failed runs contribute to the failure-category tallies; passed runs
 * never carry a category into the totals.
 *
 * Throws if the project does not exist.
 */
export async function fetchBaselineBucket(
	client: Client,
	experimentName: string,
): Promise<ExperimentBucket> {
	// readProject throws on a missing project — that's the documented contract.
	const project = await client.readProject({ projectName: experimentName });
	const scenarios = new Map<string, ScenarioCounts>();
	const failureCategoryTotals: Record<string, number> = {};
	let trialTotal = 0;

	for await (const run of client.listRuns({ projectId: project.id, isRoot: true })) {
		// Runs without both routing fields can't be bucketed — drop them.
		const inputs = inputsSchema.safeParse(run.inputs ?? {});
		if (!inputs.success || !inputs.data.testCaseFile || !inputs.data.scenarioName) continue;
		// Skip runs that never produced outputs (still running, crashed before
		// completion, infra error). Without this guard, every field defaults
		// (passed → false) would coerce them into "failed" trials and inflate
		// the baseline failure count. Mirrors `parseTargetOutput` in cli/index.ts.
		const rawOutputs = run.outputs;
		if (
			rawOutputs === null ||
			rawOutputs === undefined ||
			typeof rawOutputs !== 'object' ||
			Object.keys(rawOutputs).length === 0
		) {
			continue;
		}
		const outputs = outputsSchema.safeParse(rawOutputs);
		if (!outputs.success) continue;

		const key = `${inputs.data.testCaseFile}/${inputs.data.scenarioName}`;
		// Lazily create the per-scenario accumulator on first sighting.
		const existing: ScenarioCounts = scenarios.get(key) ?? {
			testCaseFile: inputs.data.testCaseFile,
			scenarioName: inputs.data.scenarioName,
			passed: 0,
			total: 0,
			failureCategories: {},
		};
		existing.total++;
		trialTotal++;
		if (outputs.data.passed) {
			existing.passed++;
		} else if (outputs.data.failureCategory) {
			const cat = outputs.data.failureCategory;
			// Defensive re-init: `failureCategories` is optional on ScenarioCounts.
			existing.failureCategories = existing.failureCategories ?? {};
			existing.failureCategories[cat] = (existing.failureCategories[cat] ?? 0) + 1;
			failureCategoryTotals[cat] = (failureCategoryTotals[cat] ?? 0) + 1;
		}
		scenarios.set(key, existing);
	}

	return { experimentName, scenarios, failureCategoryTotals, trialTotal };
}
|
||||
961
packages/@n8n/instance-ai/evaluations/comparison/format.ts
Normal file
961
packages/@n8n/instance-ai/evaluations/comparison/format.ts
Normal file
|
|
@ -0,0 +1,961 @@
|
|||
// ---------------------------------------------------------------------------
|
||||
// Render the eval run as a PR comment (markdown) or a console summary
|
||||
// (aligned plain text). Both formats are driven by:
|
||||
//
|
||||
// - MultiRunEvaluation — pass rates, build counts, per-trial reasoning
|
||||
// - ComparisonOutcome (optional) — tagged result of the baseline
|
||||
// comparison: `ok` (ran, has scenarios), `no_baseline` (skipped), or
|
||||
// `fetch_failed` / `self_baseline` (skipped for cause). Each kind
|
||||
// drives a distinct top-of-comment alert so a LangSmith outage doesn't
|
||||
// get dressed up as "no baseline configured".
|
||||
//
|
||||
// When no comparison is available (no baseline yet, LangSmith offline)
|
||||
// the renderers still produce a useful per-test-case summary. When a
|
||||
// comparison is available, sections render in priority order:
|
||||
// regressions, soft regressions, notable movement, improvements,
|
||||
// failure-category drift. Only sections with content are emitted.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
import {
|
||||
hardRegressions,
|
||||
improvements,
|
||||
softRegressions,
|
||||
watchList,
|
||||
type ComparisonOutcome,
|
||||
type ComparisonResult,
|
||||
type FailureCategoryComparison,
|
||||
type ScenarioComparison,
|
||||
} from './compare';
|
||||
import type {
|
||||
MultiRunEvaluation,
|
||||
TestCaseAggregation,
|
||||
WorkflowTestCase,
|
||||
WorkflowTestCaseResult,
|
||||
} from '../types';
|
||||
|
||||
/** Options shared by the markdown and console renderers. */
interface FormatOptions {
	/** Optional commit SHA to include in the heading. Truncated to 8 chars. */
	commitSha?: string;
	/** Maps each test-case reference to its file slug. When provided, the
	 * per-scenario failure breakdown looks up failed runs by
	 * `${fileSlug}/${scenarioName}` — deterministic across collisions like
	 * multiple `happy-path` scenarios. When omitted, the breakdown is
	 * skipped (no name-only fallback — that lookup was wrong on real data). */
	slugByTestCase?: Map<WorkflowTestCase, string>;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Markdown PR comment
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Render the full PR comment as markdown: heading, top alert, aggregate
 * block, then — when a comparison ran — tiered scenario sections in
 * priority order (hard, soft, watch, improvements), failure-category drift,
 * collapsed per-test-case results, other findings, and collapsed failure
 * details. Sections with no content are omitted.
 *
 * @param evaluation The multi-run evaluation of the PR's eval run.
 * @param outcome Tagged result of the baseline comparison; absent means
 *   LangSmith was disabled entirely for this run.
 * @param options Heading SHA and the test-case → slug map.
 * @returns One markdown string, newline-joined.
 */
export function formatComparisonMarkdown(
	evaluation: MultiRunEvaluation,
	outcome?: ComparisonOutcome,
	options: FormatOptions = {},
): string {
	const lines: string[] = [];
	// Only an `ok` outcome carries scenario data; every other kind is handled
	// by the top alert alone and the comparison sections are skipped.
	const comparison = outcome?.kind === 'ok' ? outcome.result : undefined;

	lines.push(formatHeading(options.commitSha));
	lines.push('');
	lines.push(formatTopAlert(outcome));
	lines.push('');
	lines.push(formatAggregateBlock(evaluation, comparison));
	lines.push('');

	if (comparison) {
		const hard = hardRegressions(comparison);
		const soft = softRegressions(comparison);
		const watch = watchList(comparison);
		const imps = improvements(comparison);

		const renderedAnyTable = hard.length > 0 || soft.length > 0 || imps.length > 0;

		// Built once and reused across the regression-tier sections so each
		// scenario row can carry a collapsible breakdown of its failed PR runs.
		// Improvements skip the breakdown — they passed. Skipped entirely when
		// the caller didn't pass a slug map (lookup would be ambiguous).
		const failedIndex = options.slugByTestCase
			? buildFailedRunsIndex(evaluation, options.slugByTestCase)
			: undefined;

		if (hard.length > 0) {
			lines.push(
				...renderScenarioSection('Regressions', '— high-confidence', hard, true, failedIndex),
			);
		}
		if (soft.length > 0) {
			lines.push(
				...renderScenarioSection(
					'Soft regressions',
					'— investigate if related to your changes',
					soft,
					true,
					failedIndex,
				),
			);
		}
		if (watch.length > 0) {
			lines.push(
				...renderScenarioSection(
					'Notable movement',
					'— large gap, no statistical flag',
					watch,
					false,
					failedIndex,
				),
			);
		}
		if (imps.length > 0) {
			lines.push(...renderScenarioSection('Improvements', '', imps, true));
		}

		// The p-value footnote only makes sense under a table with a p column
		// (hard/soft/improvements render with one; watch does not).
		if (renderedAnyTable) {
			lines.push(
				"_p = Fisher's exact one-sided p-value. Lower = stronger evidence of a real change._",
			);
			lines.push('');
		}

		// Always render the breakdown when comparison data is available — the
		// renderer drops 0/0 rows itself, so empty categories don't pollute
		// the output but the reader still sees the full taxonomy of what's
		// tracked.
		lines.push(...renderFailureCategorySection(comparison.failureCategories));
	}

	lines.push(...renderPerTestCaseDetails(evaluation, options.slugByTestCase));

	if (comparison) {
		const otherFindings = renderOtherFindings(comparison);
		if (otherFindings.length > 0) lines.push(...otherFindings);
	}

	const failureDetails = renderFailureDetails(evaluation, options.slugByTestCase);
	if (failureDetails.length > 0) lines.push(...failureDetails);

	return lines.join('\n');
}
|
||||
|
||||
function formatHeading(commitSha?: string): string {
|
||||
const sha = commitSha ? ` — \`${commitSha.slice(0, 8)}\`` : '';
|
||||
return `### Instance AI Workflow Eval${sha}`;
|
||||
}
|
||||
|
||||
/**
 * Build the GitHub alert block at the top of the comment. Each non-`ok`
 * outcome kind gets its own distinct NOTE/WARNING so a LangSmith outage
 * never reads as "no baseline configured". An `ok` outcome renders a
 * traffic-light icon, all five tier counts, and the aggregate delta.
 */
function formatTopAlert(outcome?: ComparisonOutcome): string {
	// No outcome at all: LangSmith was disabled for the whole run.
	if (!outcome) {
		return ['> [!NOTE]', '> No baseline comparison ran (LangSmith disabled for this run).'].join(
			'\n',
		);
	}

	if (outcome.kind === 'no_baseline') {
		return [
			'> [!NOTE]',
			'> No baseline configured — comparison skipped. Run the eval with `--experiment-name instance-ai-baseline` on master to create one.',
		].join('\n');
	}
	if (outcome.kind === 'self_baseline') {
		return [
			'> [!NOTE]',
			`> This run is the baseline (\`${outcome.experimentName}\`) — nothing to compare against.`,
		].join('\n');
	}
	if (outcome.kind === 'fetch_failed') {
		return [
			'> [!WARNING]',
			`> Regression detection did not run — baseline fetch failed: ${outcome.error}`,
		].join('\n');
	}

	// Remaining kind is `ok`: summarise the comparison itself.
	const comparison = outcome.result;
	const hard = hardRegressions(comparison).length;
	const soft = softRegressions(comparison).length;
	const watch = watchList(comparison).length;
	const imps = improvements(comparison).length;
	const stable = countByVerdict(comparison, 'stable');

	// Aggregate delta in percentage points, with an explicit '+' so a flat
	// result reads as "+0.0pp" rather than bare "0.0pp".
	const aggDelta = comparison.aggregate.delta * 100;
	const aggDeltaText = `${aggDelta >= 0 ? '+' : ''}${aggDelta.toFixed(1)}pp`;

	// Always include all five tier counts so readers see what's being tracked,
	// not just what's > 0. The hard count is bolded when nonzero for emphasis.
	const summary = [
		hard > 0 ? `**${hard} regression${hard === 1 ? '' : 's'}**` : '0 regressions',
		`${soft} soft`,
		`${watch} notable`,
		`${imps} improvement${imps === 1 ? '' : 's'}`,
		`${stable} stable`,
	].join(', ');

	// Severity ladder: the worst nonzero tier picks both icon and alert kind.
	let icon: string;
	let alertKind: 'CAUTION' | 'WARNING' | 'NOTE' | 'TIP';

	if (hard > 0) {
		icon = '🔴';
		alertKind = 'CAUTION';
	} else if (soft > 0) {
		icon = '🟡';
		alertKind = 'WARNING';
	} else if (watch > 0) {
		icon = '🔵';
		alertKind = 'NOTE';
	} else {
		icon = '🟢';
		alertKind = 'TIP';
	}

	return `> [!${alertKind}]\n> ${icon} ${summary}. Pass rate ${aggDeltaText} vs master.`;
}
|
||||
|
||||
/**
 * One- or two-line aggregate summary. Without a comparison: overall pass
 * rate over every scenario trial in the evaluation. With one: PR vs
 * baseline micro-average over the scenario intersection, a sample-size
 * line, and a note for scenarios present on only one side.
 */
function formatAggregateBlock(
	evaluation: MultiRunEvaluation,
	comparison?: ComparisonResult,
): string {
	if (!comparison) {
		// No-baseline path: flatten all scenario aggregations and compute the
		// overall pass fraction directly (0% when there were no trials).
		const allScenarios = evaluation.testCases.flatMap((tc) => tc.scenarios);
		const passed = allScenarios.reduce((sum, sa) => sum + sa.passCount, 0);
		const total = allScenarios.reduce((sum, sa) => sum + sa.runs.length, 0);
		const rate = total > 0 ? (passed / total) * 100 : 0;
		return `**Aggregate**: ${rate.toFixed(1)}% pass (${passed}/${total} trials, ${allScenarios.length} scenarios × N=${evaluation.totalRuns})`;
	}

	const { aggregate } = comparison;
	const delta = aggregate.delta * 100;
	const sign = delta >= 0 ? '+' : '';
	const arrow = delta > 0 ? ' ↑' : delta < 0 ? ' ↓' : '';

	// `inferBaselineN` is defined elsewhere in this file; a falsy result
	// drops the "vs N=… (baseline)" segment from the sample line.
	const baselineN = inferBaselineN(comparison);
	const sampleLine = baselineN
		? `_${aggregate.intersectionSize} scenarios · N=${evaluation.totalRuns} (PR) vs N=${baselineN} (baseline) · baseline: \`${comparison.baseline.experimentName}\`_`
		: `_${aggregate.intersectionSize} scenarios · N=${evaluation.totalRuns} (PR) · baseline: \`${comparison.baseline.experimentName}\`_`;

	// One-sided coverage note: scenarios only in the baseline or only in the
	// PR are excluded from the aggregate, so call them out explicitly.
	const partial = comparison.baselineOnly.length + comparison.prOnly.length;
	const partialNote =
		partial > 0
			? `\n_Partial: ${[
					comparison.baselineOnly.length > 0
						? `${comparison.baselineOnly.length} baseline scenarios not run by PR`
						: null,
					comparison.prOnly.length > 0
						? `${comparison.prOnly.length} PR scenarios have no baseline data (added since baseline captured)`
						: null,
				]
					.filter((s) => s !== null)
					.join(', ')}._`
			: '';

	return [
		`**Aggregate**: ${pct(aggregate.prAggregatePassRate)}% PR vs ${pct(aggregate.baselineAggregatePassRate)}% baseline — **${sign}${delta.toFixed(1)}pp${arrow}**`,
		sampleLine + partialNote,
	].join('\n');
}
|
||||
|
||||
/**
 * Render one tier section ("Regressions", "Improvements", …) as a markdown
 * table, optionally with a p-value column and per-scenario collapsible
 * failure breakdowns.
 *
 * @param heading Section title; rendered with the scenario count.
 * @param subtitle Extra text after the count; empty string omits it.
 * @param scenarios Rows to render, in the order given.
 * @param withPValue Adds the `p` column to the table.
 * @param failedIndex Optional slug → failed-run index for the breakdowns.
 * @returns Markdown lines (never joined here).
 */
function renderScenarioSection(
	heading: string,
	subtitle: string,
	scenarios: ScenarioComparison[],
	withPValue: boolean,
	failedIndex?: FailedRunsBySlug,
): string[] {
	const lines: string[] = [];
	const headingLine = subtitle
		? `#### ${heading} (${scenarios.length}) ${subtitle}`
		: `#### ${heading} (${scenarios.length})`;
	lines.push(headingLine);
	lines.push('');
	if (withPValue) {
		lines.push('| Scenario | PR | Baseline | Δ | p |');
		lines.push('|---|---|---|---|---|');
	} else {
		lines.push('| Scenario | PR | Baseline | Δ |');
		lines.push('|---|---|---|---|');
	}
	for (const s of scenarios) {
		const cells = [
			`\`${s.testCaseFile}/${s.scenarioName}\``,
			formatRateCell(s.prPasses, s.prTotal),
			formatRateCell(s.baselinePasses, s.baselineTotal),
			formatDeltaCell(s.delta),
		];
		if (withPValue) {
			// `improvement` verdicts show the right-tail p-value; every other
			// flagged tier shows the left-tail one.
			const p = s.verdict === 'improvement' ? s.pValueRight : s.pValueLeft;
			cells.push(p.toFixed(3));
		}
		lines.push(`| ${cells.join(' | ')} |`);
	}
	lines.push('');

	// Per-scenario failure breakdown — one collapsible per row that had failed
	// PR runs. Lets the reader drill into each flagged scenario without
	// hunting through a separate "Failure details" section.
	if (failedIndex) {
		for (const s of scenarios) {
			const failedRuns = failedIndex.get(`${s.testCaseFile}/${s.scenarioName}`) ?? [];
			if (failedRuns.length === 0) continue;
			lines.push(...renderScenarioFailureBreakdown(s, failedRuns));
		}
	}

	return lines;
}
|
||||
|
||||
function renderScenarioFailureBreakdown(
|
||||
s: ScenarioComparison,
|
||||
failedRuns: FailedRunDetail[],
|
||||
): string[] {
|
||||
const slug = `${s.testCaseFile}/${s.scenarioName}`;
|
||||
const categoryMix = summarizeCategories(failedRuns);
|
||||
const summaryParts = [`${failedRuns.length} of ${s.prTotal} failed`];
|
||||
if (categoryMix) summaryParts.push(categoryMix);
|
||||
|
||||
const lines: string[] = [];
|
||||
lines.push(`<details><summary><code>${slug}</code> — ${summaryParts.join(' · ')}</summary>`);
|
||||
lines.push('');
|
||||
for (const fr of failedRuns) {
|
||||
const tag = fr.category ? ` [${fr.category}]` : '';
|
||||
lines.push(`> Run ${fr.runIndex}${tag}: ${fr.reasoning.slice(0, 300)}`);
|
||||
lines.push('>');
|
||||
}
|
||||
// Drop the trailing empty quote line.
|
||||
if (lines[lines.length - 1] === '>') lines.pop();
|
||||
lines.push('');
|
||||
lines.push('</details>');
|
||||
lines.push('');
|
||||
return lines;
|
||||
}
|
||||
|
||||
function renderFailureCategorySection(categories: FailureCategoryComparison[]): string[] {
|
||||
// Drop rows that are 0/0 on both sides — they carry no signal for the
|
||||
// reader. Categories with non-zero count on either side are kept so the
|
||||
// reader sees the full picture even if not "notable".
|
||||
const rows = categories.filter((c) => c.prCount > 0 || c.baselineCount > 0);
|
||||
if (rows.length === 0) return [];
|
||||
|
||||
const lines: string[] = [];
|
||||
lines.push('#### Failure breakdown');
|
||||
lines.push('');
|
||||
lines.push('| Category | PR | Baseline | Δ | |');
|
||||
lines.push('|---|---|---|---|---|');
|
||||
for (const c of rows) {
|
||||
const isNew = c.baselineCount === 0 && c.prCount > 0;
|
||||
const label = isNew ? `\`${c.category}\` 🆕` : `\`${c.category}\``;
|
||||
const delta = c.delta * 100;
|
||||
const sign = delta >= 0 ? '+' : '';
|
||||
const arrow = delta > 0 ? ' ↑' : delta < 0 ? ' ↓' : '';
|
||||
const notableMarker = c.notable ? '**notable**' : '';
|
||||
lines.push(
|
||||
`| ${label} | ${c.prCount} (${pct(c.prRate)}%) | ${c.baselineCount} (${pct(c.baselineRate)}%) | ${sign}${delta.toFixed(1)}pp${arrow} | ${notableMarker} |`,
|
||||
);
|
||||
}
|
||||
lines.push('');
|
||||
return lines;
|
||||
}
|
||||
|
||||
/**
 * Collapsed per-test-case results table. Multi-run evaluations get pass@K /
 * pass^K columns (macro average over the test case's scenarios at
 * K = totalRuns); single-run evaluations get a simpler built/pass-count
 * table. Returns [] when there are no test cases.
 */
function renderPerTestCaseDetails(
	evaluation: MultiRunEvaluation,
	slugByTestCase?: Map<WorkflowTestCase, string>,
): string[] {
	const { totalRuns, testCases } = evaluation;
	if (testCases.length === 0) return [];
	const lines: string[] = [];
	lines.push(`<details><summary>Per-test-case results (${testCases.length})</summary>`);
	lines.push('');
	// Prefer the stable file slug; fall back to a 70-char prompt prefix when
	// the caller didn't provide a slug map.
	const renderName = (tc: TestCaseAggregation): string => {
		const slug = slugByTestCase?.get(tc.testCase);
		return slug ? `\`${slug}\`` : `\`${tc.testCase.prompt.slice(0, 70)}\``;
	};
	if (totalRuns > 1) {
		lines.push(`| Workflow | Built | pass@${totalRuns} | pass^${totalRuns} |`);
		lines.push('|---|---|---|---|');
		for (const tc of testCases) {
			// Macro average of each scenario's pass@K at K = totalRuns, as a
			// rounded percent; 0 when the test case has no scenarios.
			const meanPassAtK = tc.scenarios.length
				? Math.round(
						(tc.scenarios.reduce((sum, sa) => sum + (sa.passAtK[totalRuns - 1] ?? 0), 0) /
							tc.scenarios.length) *
							100,
					)
				: 0;
			// Same, for pass^K (all-K-runs-pass).
			const meanPassHatK = tc.scenarios.length
				? Math.round(
						(tc.scenarios.reduce((sum, sa) => sum + (sa.passHatK[totalRuns - 1] ?? 0), 0) /
							tc.scenarios.length) *
							100,
					)
				: 0;
			lines.push(
				`| ${renderName(tc)} | ${tc.buildSuccessCount}/${totalRuns} | ${meanPassAtK}% | ${meanPassHatK}% |`,
			);
		}
	} else {
		// Single-run path: one trial per scenario, so show raw pass counts.
		lines.push('| Workflow | Built | Pass rate |');
		lines.push('|---|---|---|');
		for (const tc of testCases) {
			const built = tc.runs[0]?.workflowBuildSuccess ? '✓' : '✗';
			const passed = tc.scenarios.filter((sa) => sa.runs[0]?.success).length;
			const total = tc.scenarios.length;
			lines.push(`| ${renderName(tc)} | ${built} | ${passed}/${total} |`);
		}
	}
	lines.push('');
	lines.push('</details>');
	lines.push('');
	return lines;
}
|
||||
|
||||
/**
 * Collapsed section for the non-actionable verdicts: confident drops on an
 * unreliable ("flaky") baseline, scenarios with insufficient data, and
 * stable scenarios. Returns [] when all three buckets are empty.
 */
function renderOtherFindings(comparison: ComparisonResult): string[] {
	const stable = countByVerdict(comparison, 'stable');
	const flaky = countByVerdict(comparison, 'unreliable_baseline');
	const noData = countByVerdict(comparison, 'insufficient_data');
	if (stable === 0 && flaky === 0 && noData === 0) return [];

	// The <summary> line lists only the non-empty buckets.
	const summaryParts: string[] = [];
	if (flaky > 0) summaryParts.push(`${flaky} on flaky baseline`);
	if (noData > 0) summaryParts.push(`${noData} no data`);
	if (stable > 0) summaryParts.push(`${stable} stable`);
	const summary = summaryParts.join(' · ');

	const lines: string[] = [];
	lines.push(`<details><summary>Other findings: ${summary}</summary>`);
	lines.push('');

	const stableScenarios = comparison.scenarios.filter((s) => s.verdict === 'stable');
	const flakyScenarios = comparison.scenarios.filter((s) => s.verdict === 'unreliable_baseline');
	const noDataScenarios = comparison.scenarios.filter((s) => s.verdict === 'insufficient_data');

	// Flaky-baseline drops get a full table — they look like regressions but
	// the baseline itself isn't trustworthy enough to flag them.
	if (flakyScenarios.length > 0) {
		lines.push('**Confident drop on a flaky baseline (surfaced for visibility, not flagged):**');
		lines.push('');
		lines.push('| Scenario | PR | Baseline | Δ |');
		lines.push('|---|---|---|---|');
		for (const s of flakyScenarios) {
			lines.push(
				`| \`${s.testCaseFile}/${s.scenarioName}\` | ${formatRateCell(s.prPasses, s.prTotal)} | ${formatRateCell(s.baselinePasses, s.baselineTotal)} | ${formatDeltaCell(s.delta)} |`,
			);
		}
		lines.push('');
	}

	// No-data and stable scenarios are just listed by slug.
	if (noDataScenarios.length > 0) {
		lines.push(
			`**No data:** ${noDataScenarios.map((s) => `\`${s.testCaseFile}/${s.scenarioName}\``).join(', ')}`,
		);
		lines.push('');
	}

	if (stableScenarios.length > 0) {
		lines.push(`**Stable (${stableScenarios.length}):**`);
		lines.push(
			stableScenarios.map((s) => `\`${s.testCaseFile}/${s.scenarioName}\``).join(', ') + '.',
		);
		lines.push('');
	}

	lines.push('</details>');
	lines.push('');
	return lines;
}
|
||||
|
||||
/**
 * Collapsed "Failure details" section: one entry per scenario with at least
 * one failed run, each quoting the (200-char-truncated) verifier reasoning.
 * Returns [] when nothing failed.
 */
function renderFailureDetails(
	evaluation: MultiRunEvaluation,
	slugByTestCase?: Map<WorkflowTestCase, string>,
): string[] {
	const failed: Array<{
		tc: WorkflowTestCaseResult;
		fileSlug: string | undefined;
		scenarioName: string;
		failedRuns: Array<{ category?: string; reasoning: string }>;
	}> = [];
	for (const tc of evaluation.testCases) {
		const fileSlug = slugByTestCase?.get(tc.testCase);
		for (const sa of tc.scenarios) {
			const failedRuns = sa.runs
				.filter((r) => !r.success)
				.map((r) => ({ category: r.failureCategory, reasoning: r.reasoning }));
			if (failedRuns.length > 0) {
				// NOTE(review): assumes tc.runs is non-empty whenever a scenario
				// has runs — otherwise tc.runs[0] is undefined and the prompt
				// fallback below would throw. Confirm against the runner.
				failed.push({ tc: tc.runs[0], fileSlug, scenarioName: sa.scenario.name, failedRuns });
			}
		}
	}
	if (failed.length === 0) return [];

	const lines: string[] = [];
	lines.push('<details><summary>Failure details</summary>');
	lines.push('');
	for (const { tc, fileSlug, scenarioName, failedRuns } of failed) {
		// Prefer the deterministic file slug; fall back to a prompt prefix.
		const slug = fileSlug
			? `${fileSlug}/${scenarioName}`
			: `${tc.testCase.prompt.slice(0, 50).trim()} / ${scenarioName}`;
		lines.push(`**\`${slug}\`** — ${failedRuns.length} failed`);
		for (const fr of failedRuns) {
			const tag = fr.category ? ` [${fr.category}]` : '';
			lines.push(`> Run${tag}: ${fr.reasoning.slice(0, 200)}`);
		}
		lines.push('');
	}
	lines.push('</details>');
	lines.push('');
	return lines;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per-scenario failure lookup
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// The comparison carries per-scenario counts (passed / total) but not the
|
||||
// underlying reasoning text. The evaluation has the reasoning, but keys
|
||||
// testCases by reference identity — not by the `testCaseFile` slug used in
|
||||
// the comparison. The slug map (built in cli/index.ts where the file slugs
|
||||
// are first known) bridges the two so the lookup is deterministic. Without
|
||||
// it we'd have to disambiguate by scenarioName alone, which collides on
|
||||
// reused names (`happy-path` shows up across most workflows).
|
||||
|
||||
/** One failed run's detail, as surfaced in per-scenario failure lookups. */
interface FailedRunDetail {
  /** Failure category label, when the run carried one. */
  category?: string;
  /** Reasoning text copied from the failed run. */
  reasoning: string;
  runIndex: number; // 1-based for display
}

/** Failed runs keyed by the `testCaseFile/scenarioName` slug. */
type FailedRunsBySlug = Map<string, FailedRunDetail[]>;
||||
|
||||
function buildFailedRunsIndex(
|
||||
evaluation: MultiRunEvaluation,
|
||||
slugByTestCase: Map<WorkflowTestCase, string>,
|
||||
): FailedRunsBySlug {
|
||||
const map: FailedRunsBySlug = new Map();
|
||||
for (const tc of evaluation.testCases) {
|
||||
const fileSlug = slugByTestCase.get(tc.testCase);
|
||||
if (!fileSlug) continue; // testCase not in the slug map — skip rather than misattribute
|
||||
for (const sa of tc.scenarios) {
|
||||
const failedRuns: FailedRunDetail[] = [];
|
||||
sa.runs.forEach((r, i) => {
|
||||
if (!r.success) {
|
||||
failedRuns.push({
|
||||
category: r.failureCategory,
|
||||
reasoning: r.reasoning,
|
||||
runIndex: i + 1,
|
||||
});
|
||||
}
|
||||
});
|
||||
if (failedRuns.length > 0) {
|
||||
map.set(`${fileSlug}/${sa.scenario.name}`, failedRuns);
|
||||
}
|
||||
}
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
function summarizeCategories(failedRuns: FailedRunDetail[]): string | undefined {
|
||||
const counts = new Map<string, number>();
|
||||
for (const fr of failedRuns) {
|
||||
if (fr.category) counts.set(fr.category, (counts.get(fr.category) ?? 0) + 1);
|
||||
}
|
||||
if (counts.size === 0) return undefined;
|
||||
return [...counts.entries()]
|
||||
.sort((a, b) => b[1] - a[1])
|
||||
.map(([cat, n]) => `${n}× ${cat}`)
|
||||
.join(', ');
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function pct(rate: number): string {
|
||||
return (rate * 100).toFixed(1);
|
||||
}
|
||||
|
||||
function formatRateCell(passes: number, total: number): string {
|
||||
const rate = total > 0 ? Math.round((passes / total) * 100) : 0;
|
||||
return `${passes}/${total} (${rate}%)`;
|
||||
}
|
||||
|
||||
function formatDeltaCell(delta: number): string {
|
||||
const pp = delta * 100;
|
||||
const sign = pp >= 0 ? '+' : '';
|
||||
const arrow = pp > 0 ? ' ↑' : pp < 0 ? ' ↓' : '';
|
||||
return `${sign}${pp.toFixed(0)}pp${arrow}`;
|
||||
}
|
||||
|
||||
function countByVerdict(
|
||||
comparison: ComparisonResult,
|
||||
verdict: ScenarioComparison['verdict'],
|
||||
): number {
|
||||
return comparison.scenarios.filter((s) => s.verdict === verdict).length;
|
||||
}
|
||||
|
||||
/** Best-effort N=baseline iteration count. The comparison only carries trial
|
||||
* totals per scenario; we infer N from the most-common scenario total since
|
||||
* the baseline runs every scenario the same number of times. */
|
||||
function inferBaselineN(comparison: ComparisonResult): number | undefined {
|
||||
const totals = comparison.scenarios
|
||||
.filter((s) => s.baselineTotal > 0)
|
||||
.map((s) => s.baselineTotal);
|
||||
if (totals.length === 0) return undefined;
|
||||
const counts = new Map<number, number>();
|
||||
for (const t of totals) counts.set(t, (counts.get(t) ?? 0) + 1);
|
||||
let best = totals[0];
|
||||
let bestCount = 0;
|
||||
for (const [n, c] of counts) {
|
||||
if (c > bestCount) {
|
||||
best = n;
|
||||
bestCount = c;
|
||||
}
|
||||
}
|
||||
return best;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Terminal renderer: aligned plain text for the eval CLI's end-of-run print.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Leading-whitespace prefixes for terminal output: TERMINAL_INDENT for
// section lines, TERMINAL_TABLE_INDENT for table rows.
// NOTE(review): the literal widths here may have been collapsed by the diff
// extraction — confirm against the repo before relying on exact columns.
const TERMINAL_INDENT = ' ';
const TERMINAL_TABLE_INDENT = ' ';
|
||||
|
||||
/**
 * Renders the full end-of-run report as aligned plain text for the terminal:
 * title, verdict line, aggregate block, per-test-case results, then (when a
 * baseline comparison succeeded) the regression/improvement tables, the
 * failure-category breakdown, and a one-line "other" summary.
 *
 * @param evaluation Multi-run eval results for this PR.
 * @param outcome    Baseline comparison outcome; undefined when LangSmith
 *                   comparison never ran.
 * @param options    Optional commit SHA (title suffix) and test-case slug map.
 * @returns The whole report joined with newlines.
 */
export function formatComparisonTerminal(
  evaluation: MultiRunEvaluation,
  outcome?: ComparisonOutcome,
  options: FormatOptions = {},
): string {
  const lines: string[] = [];
  // Only an 'ok' outcome carries a usable ComparisonResult.
  const comparison = outcome?.kind === 'ok' ? outcome.result : undefined;

  const titleSuffix = options.commitSha ? ` — ${options.commitSha.slice(0, 8)}` : '';
  const title = `Instance AI Workflow Eval${titleSuffix}`;
  lines.push(title);
  lines.push('═'.repeat(title.length));

  lines.push(TERMINAL_INDENT + formatTerminalVerdictLine(outcome));
  lines.push('');

  lines.push(...formatTerminalAggregate(evaluation, comparison));
  lines.push('');

  lines.push(...formatTerminalPerTestCase(evaluation, options.slugByTestCase));

  if (comparison) {
    const hard = hardRegressions(comparison);
    const soft = softRegressions(comparison);
    const watch = watchList(comparison);
    const imps = improvements(comparison);

    // Each section renders only when non-empty; p-values are shown for the
    // statistically-flagged tiers, not for the watch list.
    if (hard.length > 0) {
      lines.push(
        TERMINAL_INDENT +
          'REGRESSIONS (high-confidence: large drop on a reliable scenario, unlikely noise)',
      );
      lines.push(formatTerminalScenarioTable(hard, true));
      lines.push('');
    }
    if (soft.length > 0) {
      lines.push(
        TERMINAL_INDENT +
          'SOFT REGRESSIONS (likely natural variance — investigate if related to your changes)',
      );
      lines.push(formatTerminalScenarioTable(soft, true));
      lines.push('');
    }
    if (watch.length > 0) {
      lines.push(TERMINAL_INDENT + 'NOTABLE MOVEMENT (large gap, no statistical flag)');
      lines.push(formatTerminalScenarioTable(watch, false));
      lines.push('');
    }
    if (imps.length > 0) {
      lines.push(TERMINAL_INDENT + 'IMPROVEMENTS');
      lines.push(formatTerminalScenarioTable(imps, true));
      lines.push('');
    }

    // Always render the breakdown when comparison data is available — same
    // rationale as the markdown side. The terminal table drops 0/0 rows
    // itself.
    const breakdownRows = comparison.failureCategories.filter(
      (c) => c.prCount > 0 || c.baselineCount > 0,
    );
    if (breakdownRows.length > 0) {
      lines.push(TERMINAL_INDENT + 'failure breakdown');
      lines.push(formatTerminalCategoryTable(breakdownRows));
      lines.push('');
    }

    // Stable count is already in the verdict line; surface only the rarer
    // outcomes here.
    const flaky = countByVerdict(comparison, 'unreliable_baseline');
    const noData = countByVerdict(comparison, 'insufficient_data');
    const otherParts: string[] = [];
    if (flaky > 0) otherParts.push(`${flaky} on flaky baseline`);
    if (noData > 0) otherParts.push(`${noData} no data`);
    if (otherParts.length > 0) {
      lines.push(TERMINAL_INDENT + 'other: ' + otherParts.join(' · '));
    }
  }

  return lines.join('\n');
}
|
||||
|
||||
function formatTerminalVerdictLine(outcome?: ComparisonOutcome): string {
|
||||
if (!outcome) return '▶ No baseline comparison ran (LangSmith disabled).';
|
||||
if (outcome.kind === 'no_baseline') {
|
||||
return '▶ No baseline configured — comparison skipped.';
|
||||
}
|
||||
if (outcome.kind === 'self_baseline') {
|
||||
return `▶ This run is the baseline (${outcome.experimentName}) — nothing to compare.`;
|
||||
}
|
||||
if (outcome.kind === 'fetch_failed') {
|
||||
return `▶ Regression detection did not run — baseline fetch failed: ${outcome.error}`;
|
||||
}
|
||||
|
||||
const comparison = outcome.result;
|
||||
const hard = hardRegressions(comparison).length;
|
||||
const soft = softRegressions(comparison).length;
|
||||
const watch = watchList(comparison).length;
|
||||
const imps = improvements(comparison).length;
|
||||
const stable = countByVerdict(comparison, 'stable');
|
||||
|
||||
const aggDelta = comparison.aggregate.delta * 100;
|
||||
const aggDeltaText = `${aggDelta >= 0 ? '+' : ''}${aggDelta.toFixed(1)}pp`;
|
||||
|
||||
const summary = [
|
||||
`${hard} regression${hard === 1 ? '' : 's'}`,
|
||||
`${soft} soft`,
|
||||
`${watch} notable`,
|
||||
`${imps} improvement${imps === 1 ? '' : 's'}`,
|
||||
`${stable} stable`,
|
||||
].join(', ');
|
||||
|
||||
return `▶ ${summary}. Pass rate ${aggDeltaText} vs master.`;
|
||||
}
|
||||
|
||||
/**
 * Renders the aggregate block of the terminal report. Without a comparison
 * it prints a single self-contained pass-rate line from the evaluation;
 * with one it prints PR vs baseline rates, the delta, and a note about any
 * scenarios present on only one side.
 */
function formatTerminalAggregate(
  evaluation: MultiRunEvaluation,
  comparison?: ComparisonResult,
): string[] {
  const lines: string[] = [];
  if (!comparison) {
    // No baseline: aggregate across every scenario run in this evaluation.
    const allScenarios = evaluation.testCases.flatMap((tc) => tc.scenarios);
    const passed = allScenarios.reduce((sum, sa) => sum + sa.passCount, 0);
    const total = allScenarios.reduce((sum, sa) => sum + sa.runs.length, 0);
    const rate = total > 0 ? (passed / total) * 100 : 0;
    lines.push(
      TERMINAL_INDENT +
        `Aggregate: ${rate.toFixed(1)}% pass (${passed}/${total} trials, ${allScenarios.length} scenarios × N=${evaluation.totalRuns})`,
    );
    return lines;
  }

  const { aggregate } = comparison;
  // Baseline N is inferred (mode of per-scenario totals); may be undefined.
  const baselineN = inferBaselineN(comparison);
  const aggDelta = aggregate.delta * 100;
  const sign = aggDelta >= 0 ? '+' : '';
  const arrow = aggDelta > 0 ? ' ↑' : aggDelta < 0 ? ' ↓' : '';
  lines.push(TERMINAL_INDENT + `Aggregate (${aggregate.intersectionSize} scenarios)`);
  lines.push(
    TERMINAL_INDENT +
      ` PR ${pct(aggregate.prAggregatePassRate)}% (N=${evaluation.totalRuns})`,
  );
  if (baselineN !== undefined) {
    lines.push(
      TERMINAL_INDENT +
        ` baseline ${pct(aggregate.baselineAggregatePassRate)}% (N=${baselineN})`,
    );
  } else {
    lines.push(TERMINAL_INDENT + ` baseline ${pct(aggregate.baselineAggregatePassRate)}%`);
  }
  lines.push(TERMINAL_INDENT + ` Δ ${sign}${aggDelta.toFixed(1)}pp${arrow}`);

  // Surface partial overlap: scenarios that exist only on one side are
  // excluded from the aggregate above, so say how many were dropped.
  if (comparison.baselineOnly.length > 0 || comparison.prOnly.length > 0) {
    const partialParts: string[] = [];
    if (comparison.baselineOnly.length > 0)
      partialParts.push(`${comparison.baselineOnly.length} baseline scenarios not run by PR`);
    if (comparison.prOnly.length > 0)
      partialParts.push(`${comparison.prOnly.length} PR scenarios have no baseline data`);
    lines.push(TERMINAL_INDENT + ` partial: ${partialParts.join(', ')}`);
  }

  return lines;
}
|
||||
|
||||
/**
 * Renders per-test-case results for the terminal. Multi-run evaluations get
 * one aligned table (builds, mean pass@k, mean pass^k per workflow);
 * single-run evaluations get a verbose per-test-case listing with build
 * status and per-scenario PASS/FAIL lines including failure diagnoses.
 * Returns [] when there are no test cases.
 */
function formatTerminalPerTestCase(
  evaluation: MultiRunEvaluation,
  slugByTestCase?: Map<WorkflowTestCase, string>,
): string[] {
  const { totalRuns, testCases } = evaluation;
  if (testCases.length === 0) return [];
  const lines: string[] = [];
  const heading = `Per-test-case results (${testCases.length})`;
  lines.push(TERMINAL_INDENT + heading);

  // Display name: prefer the file slug; fall back to a truncated prompt.
  const nameOf = (tc: TestCaseAggregation, max: number): string => {
    const slug = slugByTestCase?.get(tc.testCase);
    return slug ?? tc.testCase.prompt.slice(0, max);
  };

  if (totalRuns > 1) {
    const rows = testCases.map((tc) => {
      // Mean pass@k / pass^k across the test case's scenarios, as whole
      // percentages. passAtK/passHatK are per-k arrays; index k-1 is "at k".
      const meanPassAtK =
        tc.scenarios.length > 0
          ? Math.round(
              (tc.scenarios.reduce((sum, sa) => sum + (sa.passAtK[totalRuns - 1] ?? 0), 0) /
                tc.scenarios.length) *
                100,
            )
          : 0;
      const meanPassHatK =
        tc.scenarios.length > 0
          ? Math.round(
              (tc.scenarios.reduce((sum, sa) => sum + (sa.passHatK[totalRuns - 1] ?? 0), 0) /
                tc.scenarios.length) *
                100,
            )
          : 0;
      return {
        name: nameOf(tc, 60),
        builds: `${tc.buildSuccessCount}/${totalRuns}`,
        passAtK: `${meanPassAtK}%`,
        passHatK: `${meanPassHatK}%`,
      };
    });
    // Column widths sized to the widest cell or header.
    const nameW = maxWidth(
      rows.map((r) => r.name),
      'workflow',
    );
    const buildsW = maxWidth(
      rows.map((r) => r.builds),
      'builds',
    );
    const atKHeader = `pass@${totalRuns}`;
    const hatKHeader = `pass^${totalRuns}`;
    const atKW = maxWidth(
      rows.map((r) => r.passAtK),
      atKHeader,
    );
    const hatKW = maxWidth(
      rows.map((r) => r.passHatK),
      hatKHeader,
    );
    lines.push(
      TERMINAL_TABLE_INDENT +
        `${'workflow'.padEnd(nameW)} ${'builds'.padEnd(buildsW)} ${atKHeader.padStart(atKW)} ${hatKHeader.padStart(hatKW)}`,
    );
    lines.push(
      TERMINAL_TABLE_INDENT +
        `${'─'.repeat(nameW)} ${'─'.repeat(buildsW)} ${'─'.repeat(atKW)} ${'─'.repeat(hatKW)}`,
    );
    for (const r of rows) {
      // Text columns left-aligned, numeric percentage columns right-aligned.
      lines.push(
        TERMINAL_TABLE_INDENT +
          `${r.name.padEnd(nameW)} ${r.builds.padEnd(buildsW)} ${r.passAtK.padStart(atKW)} ${r.passHatK.padStart(hatKW)}`,
      );
    }
  } else {
    // Single run: verbose listing with build + per-scenario outcomes.
    for (const tc of testCases) {
      const r = tc.runs[0];
      const buildStatus = r.workflowBuildSuccess ? 'BUILT' : 'BUILD FAILED';
      lines.push('');
      lines.push(TERMINAL_INDENT + `${nameOf(tc, 70)}…`);
      lines.push(TERMINAL_INDENT + ` ${buildStatus}${r.workflowId ? ` (${r.workflowId})` : ''}`);
      if (r.buildError) lines.push(TERMINAL_INDENT + ` error: ${r.buildError.slice(0, 200)}`);
      for (const sa of tc.scenarios) {
        const sr = sa.runs[0];
        const status = sr.success ? 'PASS' : 'FAIL';
        const category = sr.failureCategory ? ` [${sr.failureCategory}]` : '';
        lines.push(TERMINAL_INDENT + ` ${status} ${sr.scenario.name}${category}`);
        if (!sr.success) {
          // Eval errors first (truncated), then the judge's diagnosis.
          const errs = sr.evalResult?.errors ?? [];
          if (errs.length > 0) {
            lines.push(TERMINAL_INDENT + ` error: ${errs.join('; ').slice(0, 200)}`);
          }
          lines.push(TERMINAL_INDENT + ` diagnosis: ${sr.reasoning.slice(0, 200)}`);
        }
      }
    }
  }
  lines.push('');
  return lines;
}
|
||||
|
||||
/**
 * Renders a column-aligned scenario comparison table (scenario, PR passes,
 * baseline passes, delta, and optionally the relevant p-value) as a single
 * multi-line string. Columns are sized to the widest cell or header.
 *
 * @param scenarios  Rows to render.
 * @param withPValue When true, append a `p` column: the right-tail p-value
 *                   for improvements, the left-tail one otherwise.
 */
function formatTerminalScenarioTable(scenarios: ScenarioComparison[], withPValue: boolean): string {
  const names = scenarios.map((s) => `${s.testCaseFile}/${s.scenarioName}`);
  const prCells = scenarios.map((s) => `${s.prPasses}/${s.prTotal}`);
  const baseCells = scenarios.map((s) => `${s.baselinePasses}/${s.baselineTotal}`);
  const deltaCells = scenarios.map((s) => {
    const d = s.delta * 100;
    const sign = d >= 0 ? '+' : '';
    const arrow = d > 0 ? ' ↑' : d < 0 ? ' ↓' : '';
    return `${sign}${d.toFixed(0)}pp${arrow}`;
  });
  const pCells = withPValue
    ? scenarios.map((s) => (s.verdict === 'improvement' ? s.pValueRight : s.pValueLeft).toFixed(3))
    : [];

  // Per-column widths: widest of cells and header.
  const nameW = maxWidth(names, 'scenario');
  const prW = maxWidth(prCells, 'PR');
  const baseW = maxWidth(baseCells, 'baseline');
  const deltaW = maxWidth(deltaCells, 'Δ');
  const pW = withPValue ? maxWidth(pCells, 'p') : 0;

  const headers = [
    'scenario'.padEnd(nameW),
    'PR'.padEnd(prW),
    'baseline'.padEnd(baseW),
    'Δ'.padEnd(deltaW),
  ];
  if (withPValue) headers.push('p'.padEnd(pW));
  const widths = withPValue ? [nameW, prW, baseW, deltaW, pW] : [nameW, prW, baseW, deltaW];
  const sep = widths.map((w) => '─'.repeat(w)).join(' ');

  const rows = scenarios.map((_, i) => {
    const cells = [
      names[i].padEnd(nameW),
      prCells[i].padEnd(prW),
      baseCells[i].padEnd(baseW),
      deltaCells[i].padEnd(deltaW),
    ];
    if (withPValue) cells.push(pCells[i].padEnd(pW));
    return TERMINAL_TABLE_INDENT + cells.join(' ');
  });

  return [TERMINAL_TABLE_INDENT + headers.join(' '), TERMINAL_TABLE_INDENT + sep, ...rows].join(
    '\n',
  );
}
|
||||
|
||||
/**
 * Renders the failure-category breakdown table for the terminal: category
 * name (tagged 🆕 when the category appears only on the PR side), PR and
 * baseline counts with rates, and the rate delta in percentage points.
 */
function formatTerminalCategoryTable(cats: FailureCategoryComparison[]): string {
  const names = cats.map((c) => {
    // 🆕 marks categories introduced by this PR (absent from baseline).
    const isNew = c.baselineCount === 0 && c.prCount > 0;
    return c.category + (isNew ? ' 🆕' : '');
  });
  const prCells = cats.map((c) => `${c.prCount} (${pct(c.prRate)}%)`);
  const baseCells = cats.map((c) => `${c.baselineCount} (${pct(c.baselineRate)}%)`);
  const deltaCells = cats.map((c) => {
    const d = c.delta * 100;
    const sign = d >= 0 ? '+' : '';
    return `${sign}${d.toFixed(1)}pp`;
  });

  const nameW = maxWidth(names, 'category');
  const prW = maxWidth(prCells, 'PR');
  const baseW = maxWidth(baseCells, 'baseline');

  // The Δ header is left unpadded; only the separator row is sized to the
  // widest delta cell.
  const headers = ['category'.padEnd(nameW), 'PR'.padEnd(prW), 'baseline'.padEnd(baseW), 'Δ'];
  const sep = [nameW, prW, baseW, maxWidth(deltaCells, 'Δ')].map((w) => '─'.repeat(w)).join(' ');

  const rows = cats.map(
    (_, i) =>
      TERMINAL_TABLE_INDENT +
      [
        names[i].padEnd(nameW),
        prCells[i].padEnd(prW),
        baseCells[i].padEnd(baseW),
        deltaCells[i],
      ].join(' '),
  );

  return [TERMINAL_TABLE_INDENT + headers.join(' '), TERMINAL_TABLE_INDENT + sep, ...rows].join(
    '\n',
  );
}
|
||||
|
||||
function maxWidth(values: string[], header: string): number {
|
||||
return values.reduce((m, v) => Math.max(m, v.length), header.length);
|
||||
}
|
||||
304
packages/@n8n/instance-ai/evaluations/comparison/statistics.ts
Normal file
304
packages/@n8n/instance-ai/evaluations/comparison/statistics.ts
Normal file
|
|
@ -0,0 +1,304 @@
|
|||
// ---------------------------------------------------------------------------
|
||||
// Decides whether one scenario's pass rate is meaningfully worse than
|
||||
// another, at the small sample sizes evals run at (N=3 typically).
|
||||
//
|
||||
// Public surface:
|
||||
// - classifyScenario(prPasses, prTotal, basePasses, baseTotal) — the verdict
|
||||
// - wilsonInterval(passes, total) — confidence band for a pass rate, used
|
||||
// for the headline aggregate
|
||||
//
|
||||
// The implementation uses Fisher's exact test and the Wilson score interval
|
||||
// under the hood; both are standard small-sample statistics. You don't need
|
||||
// to know either to use the public API.
|
||||
// ---------------------------------------------------------------------------
|
||||
import { strict as assert } from 'node:assert';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Fisher's exact test (one-sided)
|
||||
//
|
||||
// Given a 2×2 table of pass/fail counts for PR vs baseline, returns the
|
||||
// probability of seeing a gap at least as bad as the observed one if the two
|
||||
// groups actually had the same pass rate. Small return value ⇒ strong
|
||||
// evidence the PR is worse.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const logFactorialCache: number[] = [0, 0];
|
||||
|
||||
function logFactorial(n: number): number {
|
||||
for (let i = logFactorialCache.length; i <= n; i++) {
|
||||
logFactorialCache.push(logFactorialCache[i - 1] + Math.log(i));
|
||||
}
|
||||
return logFactorialCache[n];
|
||||
}
|
||||
|
||||
function logBinomial(n: number, k: number): number {
|
||||
if (k < 0 || k > n) return -Infinity;
|
||||
return logFactorial(n) - logFactorial(k) - logFactorial(n - k);
|
||||
}
|
||||
|
||||
function hypergeomPmf(nPasses: number, nFails: number, nDrawn: number, k: number): number {
|
||||
const total = nPasses + nFails;
|
||||
if (k < Math.max(0, nDrawn - nFails) || k > Math.min(nDrawn, nPasses)) return 0;
|
||||
return Math.exp(
|
||||
logBinomial(nPasses, k) + logBinomial(nFails, nDrawn - k) - logBinomial(total, nDrawn),
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* One-sided Fisher's exact test (left tail). Returns the probability that
|
||||
* PR's pass count would be at most `a` if PR and baseline shared the same
|
||||
* underlying pass rate. Small value ⇒ PR is significantly worse.
|
||||
*
|
||||
* 2×2 table:
|
||||
*
|
||||
* passed failed
|
||||
* PR | a | b |
|
||||
* Baseline | c | d |
|
||||
*
|
||||
* Returns 1 (no information) when either side has no trials, or when all
|
||||
* trials passed or all failed.
|
||||
*/
|
||||
export function fishersExactOneSidedLeft(a: number, b: number, c: number, d: number): number {
|
||||
const inputs = [a, b, c, d];
|
||||
for (const v of inputs) {
|
||||
assert(
|
||||
Number.isInteger(v) && v >= 0,
|
||||
'fishersExactOneSidedLeft requires non-negative integers',
|
||||
);
|
||||
}
|
||||
|
||||
const nPr = a + b;
|
||||
const nBase = c + d;
|
||||
const nPasses = a + c;
|
||||
const nFails = b + d;
|
||||
|
||||
if (nPr === 0 || nBase === 0) return 1;
|
||||
if (nPasses === 0 || nFails === 0) return 1;
|
||||
|
||||
let pValue = 0;
|
||||
const kMax = Math.min(a, nPasses);
|
||||
for (let k = 0; k <= kMax; k++) {
|
||||
pValue += hypergeomPmf(nPasses, nFails, nPr, k);
|
||||
}
|
||||
// Clamp to [0, 1] — accumulated FP error can push the sum slightly past 1.
|
||||
return Math.min(1, Math.max(0, pValue));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Wilson score interval (95% confidence)
|
||||
//
|
||||
// Returns a confidence band for a pass rate that behaves well at small N and
|
||||
// at extreme rates (close to 0 or 1) — both common in our evals. Used for
|
||||
// the headline aggregate band only; classification doesn't need it.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Standard z-score for a 95% confidence interval. We only ever use 95%, so
|
||||
// the value is inlined rather than parameterised.
|
||||
const Z_95 = 1.96;
|
||||
|
||||
export function wilsonInterval(passes: number, total: number): { lower: number; upper: number } {
|
||||
assert(
|
||||
Number.isInteger(passes) && passes >= 0,
|
||||
'wilsonInterval: passes must be a non-negative integer',
|
||||
);
|
||||
assert(
|
||||
Number.isInteger(total) && total >= 0,
|
||||
'wilsonInterval: total must be a non-negative integer',
|
||||
);
|
||||
assert(passes <= total, 'wilsonInterval: passes cannot exceed total');
|
||||
|
||||
if (total === 0) return { lower: 0, upper: 1 };
|
||||
|
||||
const p = passes / total;
|
||||
const z2 = Z_95 * Z_95;
|
||||
const denom = 1 + z2 / total;
|
||||
const center = (p + z2 / (2 * total)) / denom;
|
||||
const halfWidth = (Z_95 * Math.sqrt((p * (1 - p)) / total + z2 / (4 * total * total))) / denom;
|
||||
return {
|
||||
lower: Math.max(0, center - halfWidth),
|
||||
upper: Math.min(1, center + halfWidth),
|
||||
};
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per-scenario classification
|
||||
//
|
||||
// Three flag tiers, evaluated in order of strictness:
|
||||
//
|
||||
// hard_regression — high-confidence drop on a reliable baseline.
|
||||
// Gating-grade.
|
||||
// soft_regression — looser bar; investigate, not gating.
|
||||
// watch — moved noticeably but didn't pass either flag tier.
|
||||
// Pure visibility.
|
||||
//
|
||||
// Improvements use the hard tier (we don't surface borderline improvements;
|
||||
// they tend to be noise in the positive direction).
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Outcome tier assigned to one scenario by `classifyScenario`. */
export type ScenarioVerdict =
  | 'hard_regression' // PR is confidently worse, baseline was reliable
  | 'soft_regression' // looser bar — worth investigating, not high-confidence
  | 'watch' // moved enough to surface but no flag tier triggered
  | 'improvement' // PR is significantly better
  | 'stable' // no meaningful change
  | 'unreliable_baseline' // confident drop but baseline was too flaky to trust
  | 'insufficient_data'; // either side had zero trials

/** Full classification result for one scenario: verdict plus the rates and
 * p-values it was derived from. */
export interface ScenarioClassification {
  verdict: ScenarioVerdict;
  /** PR pass rate (0..1) */
  prPassRate: number;
  /** Baseline pass rate (0..1) */
  baselinePassRate: number;
  /** PR rate − baseline rate, signed. Negative = PR worse. */
  delta: number;
  /** Probability the PR is at least this much worse by chance. Lower ⇒ stronger regression evidence. */
  pValueLeft: number;
  /** Probability the PR is at least this much better by chance. */
  pValueRight: number;
}

/** Gates a scenario must pass to be flagged at a given tier; all three must
 * hold simultaneously (see `meetsThreshold`). */
export interface TierThresholds {
  /** Flag only when the chance the gap happened by noise is below this. */
  maxPValue: number;
  /** Flag only when the absolute pass-rate gap is at least this large (0..1). */
  minDelta: number;
  /** Flag only when the baseline pass rate was at least this high (0..1). */
  minBaselinePassRate: number;
}

/** Threshold overrides for `classifyScenario`; primarily for tests. */
export interface ClassifyOptions {
  /** Hard-flag thresholds (most strict). Defaults: maxPValue=0.05, minDelta=0.30, minBaselinePassRate=0.70. */
  hard?: Partial<TierThresholds>;
  /** Soft-flag thresholds (looser). Defaults: maxPValue=0.20, minDelta=0.15, minBaselinePassRate=0.50. */
  soft?: Partial<TierThresholds>;
  /** Absolute pass-rate change required for a "watch" verdict regardless of significance. Default 0.35. */
  watchDelta?: number;
}

// Default tier thresholds — keep the values in sync with the ClassifyOptions
// doc comments above.
const DEFAULT_HARD: TierThresholds = {
  maxPValue: 0.05,
  minDelta: 0.3,
  minBaselinePassRate: 0.7,
};
const DEFAULT_SOFT: TierThresholds = {
  maxPValue: 0.2,
  minDelta: 0.15,
  minBaselinePassRate: 0.5,
};
// Watch threshold: surface scenarios whose pass rate changed by at least 35pp
// without reaching a flag tier. High enough that natural noise on rock-solid
// scenarios (e.g. 2/3 vs 10/10 = −33pp) doesn't crowd the comment.
const DEFAULT_WATCH_DELTA = 0.35;
||||
|
||||
function meetsThreshold(
|
||||
pValue: number,
|
||||
delta: number,
|
||||
baselineRate: number,
|
||||
tier: TierThresholds,
|
||||
direction: 'worse' | 'better',
|
||||
): boolean {
|
||||
if (pValue >= tier.maxPValue) return false;
|
||||
if (direction === 'worse') {
|
||||
if (delta > -tier.minDelta) return false;
|
||||
if (baselineRate < tier.minBaselinePassRate) return false;
|
||||
} else {
|
||||
if (delta < tier.minDelta) return false;
|
||||
// Improvements skip the reliability gate — fixing flaky scenarios is a real win.
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Classify a single scenario into one of seven verdicts. See ScenarioVerdict
 * for the tier semantics.
 *
 * Verdicts are evaluated in strict precedence order: insufficient_data →
 * improvement → hard_regression → unreliable_baseline → soft_regression →
 * watch → stable. The first matching tier wins, so reordering the checks
 * below would change results.
 *
 * `options` exists for tests; production callers leave thresholds at defaults.
 */
export function classifyScenario(
  prPasses: number,
  prTotal: number,
  baselinePasses: number,
  baselineTotal: number,
  options: ClassifyOptions = {},
): ScenarioClassification {
  const hard: TierThresholds = { ...DEFAULT_HARD, ...options.hard };
  const soft: TierThresholds = { ...DEFAULT_SOFT, ...options.soft };
  const watchDelta = options.watchDelta ?? DEFAULT_WATCH_DELTA;

  // Zero-trial sides report a 0 rate (display-only; the verdict below
  // short-circuits before any statistics run).
  const prPassRate = prTotal > 0 ? prPasses / prTotal : 0;
  const baselinePassRate = baselineTotal > 0 ? baselinePasses / baselineTotal : 0;

  if (prTotal === 0 || baselineTotal === 0) {
    return {
      verdict: 'insufficient_data',
      prPassRate,
      baselinePassRate,
      delta: prPassRate - baselinePassRate,
      pValueLeft: 1,
      pValueRight: 1,
    };
  }

  // 2×2 contingency table: PR = (a passed, b failed), baseline = (c, d).
  const a = prPasses;
  const b = prTotal - prPasses;
  const c = baselinePasses;
  const d = baselineTotal - baselinePasses;

  const pValueLeft = fishersExactOneSidedLeft(a, b, c, d);
  // Right tail (PR better) = left tail with the two groups swapped.
  const pValueRight = fishersExactOneSidedLeft(c, d, a, b);
  const delta = prPassRate - baselinePassRate;

  // Improvement (right tail) — single tier, hard thresholds only
  if (meetsThreshold(pValueRight, delta, baselinePassRate, hard, 'better')) {
    return { verdict: 'improvement', prPassRate, baselinePassRate, delta, pValueLeft, pValueRight };
  }

  // Hard regression — passes all three hard gates
  if (meetsThreshold(pValueLeft, delta, baselinePassRate, hard, 'worse')) {
    return {
      verdict: 'hard_regression',
      prPassRate,
      baselinePassRate,
      delta,
      pValueLeft,
      pValueRight,
    };
  }

  // Confident drop, but on a baseline too flaky to call a regression.
  // Surface as `unreliable_baseline` so it's visible without being a flag.
  if (
    pValueLeft < hard.maxPValue &&
    delta <= -hard.minDelta &&
    baselinePassRate < hard.minBaselinePassRate
  ) {
    return {
      verdict: 'unreliable_baseline',
      prPassRate,
      baselinePassRate,
      delta,
      pValueLeft,
      pValueRight,
    };
  }

  // Soft regression — passes the looser gates
  if (meetsThreshold(pValueLeft, delta, baselinePassRate, soft, 'worse')) {
    return {
      verdict: 'soft_regression',
      prPassRate,
      baselinePassRate,
      delta,
      pValueLeft,
      pValueRight,
    };
  }

  // Watch — meaningful movement but no flag fired. Pure visibility.
  if (Math.abs(delta) >= watchDelta) {
    return { verdict: 'watch', prPassRate, baselinePassRate, delta, pValueLeft, pValueRight };
  }

  return { verdict: 'stable', prPassRate, baselinePassRate, delta, pValueLeft, pValueRight };
}
|
||||
|
|
@ -28,7 +28,7 @@ import type {
|
|||
// Constants
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const DEFAULT_TIMEOUT_MS = 600_000;
|
||||
const DEFAULT_TIMEOUT_MS = 900_000;
|
||||
const SSE_SETTLE_DELAY_MS = 200;
|
||||
const POLL_INTERVAL_MS = 500;
|
||||
const BACKGROUND_TASK_POLL_INTERVAL_MS = 2_000;
|
||||
|
|
|
|||
|
|
@ -39,3 +39,38 @@ export type {
|
|||
ChecklistItem,
|
||||
ChecklistResult,
|
||||
} from './types';
|
||||
|
||||
// -- Comparison (regression detection) --
// Core comparison + verdict filters.
export {
  compareBuckets,
  byVerdict,
  improvements,
  hardRegressions,
  softRegressions,
  watchList,
} from './comparison/compare';
export type {
  ComparisonResult,
  ScenarioComparison,
  ScenarioCounts,
  ExperimentBucket,
  AggregateComparison,
  FailureCategoryComparison,
} from './comparison/compare';
// Small-sample statistics used by the classifier.
export {
  classifyScenario,
  fishersExactOneSidedLeft,
  wilsonInterval,
} from './comparison/statistics';
export type {
  ScenarioVerdict,
  ScenarioClassification,
  ClassifyOptions,
  TierThresholds,
} from './comparison/statistics';
// Renderers for the PR comment (markdown) and CLI output (terminal).
export { formatComparisonMarkdown, formatComparisonTerminal } from './comparison/format';
// LangSmith baseline retrieval.
export {
  fetchBaselineBucket,
  findLatestBaseline,
  BASELINE_EXPERIMENT_PREFIX,
} from './comparison/fetch-baseline';
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user