n8n/packages/@n8n/instance-ai/evaluations/comparison/statistics.ts
José Braulio González Valido bbe3e2d148
feat(ai-builder): Add per-PR eval regression detection vs LangSmith baseline (#29456)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-06 08:15:08 +00:00

305 lines
10 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// ---------------------------------------------------------------------------
// Decides whether one scenario's pass rate is meaningfully worse than
// another, at the small sample sizes evals run at (N=3 typically).
//
// Public surface:
// - classifyScenario(prPasses, prTotal, basePasses, baseTotal) — the verdict
// - wilsonInterval(passes, total) — confidence band for a pass rate, used
// for the headline aggregate
//
// The implementation uses Fisher's exact test and the Wilson score interval
// under the hood; both are standard small-sample statistics. You don't need
// to know either to use the public API.
// ---------------------------------------------------------------------------
import { strict as assert } from 'node:assert';
// ---------------------------------------------------------------------------
// Fisher's exact test (one-sided)
//
// Given a 2×2 table of pass/fail counts for PR vs baseline, returns the
// probability of seeing a gap at least as bad as the observed one if the two
// groups actually had the same pass rate. Small return value ⇒ strong
// evidence the PR is worse.
// ---------------------------------------------------------------------------
// Memo table for ln(n!): index n holds ln(n!). Seeded with ln(0!) = ln(1!) = 0
// and extended on demand by logFactorial.
const logFactorialCache: number[] = [0, 0];

/**
 * Natural log of n!, computed incrementally and memoised.
 *
 * Working in log space keeps the binomial coefficients built from this value
 * from overflowing. The table only ever grows; each new entry is the previous
 * entry plus ln(index).
 *
 * @param n Non-negative integer whose factorial is wanted.
 * @returns ln(n!).
 */
function logFactorial(n: number): number {
	while (logFactorialCache.length <= n) {
		const next = logFactorialCache.length;
		logFactorialCache.push(logFactorialCache[next - 1] + Math.log(next));
	}
	return logFactorialCache[n];
}
/**
 * Natural log of the binomial coefficient C(n, k).
 *
 * @param n Size of the set being chosen from.
 * @param k Number of items chosen.
 * @returns ln(C(n, k)), or -Infinity (i.e. ln(0)) when k lies outside 0..n.
 */
function logBinomial(n: number, k: number): number {
	const outOfRange = k < 0 || k > n;
	if (outOfRange) {
		return Number.NEGATIVE_INFINITY;
	}
	const logNumerator = logFactorial(n);
	const logDenominator = logFactorial(k);
	return logNumerator - logDenominator - logFactorial(n - k);
}
/**
 * Hypergeometric probability mass: the chance of getting exactly `k` passes
 * when `nDrawn` trials are drawn without replacement from a pool of
 * `nPasses` passes and `nFails` fails.
 *
 * @returns The probability, or 0 when `k` is outside the feasible range.
 */
function hypergeomPmf(nPasses: number, nFails: number, nDrawn: number, k: number): number {
	// Feasible pass counts: cannot draw more fails than exist, nor more passes
	// than were drawn or than exist.
	const fewestPasses = Math.max(0, nDrawn - nFails);
	const mostPasses = Math.min(nDrawn, nPasses);
	if (k < fewestPasses || k > mostPasses) {
		return 0;
	}

	const logFavourable = logBinomial(nPasses, k) + logBinomial(nFails, nDrawn - k);
	const logAllDraws = logBinomial(nPasses + nFails, nDrawn);
	return Math.exp(logFavourable - logAllDraws);
}
/**
 * Left-tailed one-sided Fisher's exact test on a 2x2 pass/fail table.
 *
 *              passed  failed
 *   PR       |   a   |   b   |
 *   Baseline |   c   |   d   |
 *
 * With the row and column totals held fixed, sums the hypergeometric
 * probabilities of every table in which the PR has `a` or fewer passes.
 * A small result means the PR's pass count is unusually low if both groups
 * shared one underlying pass rate.
 *
 * @returns A p-value in [0, 1]. Returns 1 (no information) when either row
 *   has no trials, or when every trial passed or every trial failed.
 * @throws AssertionError when any cell is not a non-negative integer.
 */
export function fishersExactOneSidedLeft(a: number, b: number, c: number, d: number): number {
	const cells = [a, b, c, d];
	assert(
		cells.every((cell) => Number.isInteger(cell) && cell >= 0),
		'fishersExactOneSidedLeft requires non-negative integers',
	);

	const prTrials = a + b;
	const baselineTrials = c + d;
	const totalPasses = a + c;
	const totalFails = b + d;

	// Degenerate margins carry no evidence in either direction.
	const emptyRow = prTrials === 0 || baselineTrials === 0;
	const emptyColumn = totalPasses === 0 || totalFails === 0;
	if (emptyRow || emptyColumn) {
		return 1;
	}

	// Accumulate in ascending k so the floating-point sum is reproducible.
	const highestK = Math.min(a, totalPasses);
	let tail = 0;
	let k = 0;
	while (k <= highestK) {
		tail += hypergeomPmf(totalPasses, totalFails, prTrials, k);
		k += 1;
	}

	// Rounding error can nudge the sum just outside [0, 1]; clamp it back.
	return Math.max(0, Math.min(1, tail));
}
// ---------------------------------------------------------------------------
// Wilson score interval (95% confidence)
//
// Returns a confidence band for a pass rate that behaves well at small N and
// at extreme rates (close to 0 or 1) — both common in our evals. Used for
// the headline aggregate band only; classification doesn't need it.
// ---------------------------------------------------------------------------
// Standard z-score for a 95% confidence interval. We only ever use 95%, so
// the value is inlined rather than parameterised.
const Z_95 = 1.96;

/**
 * 95% Wilson score interval for an observed pass rate.
 *
 * @param passes Number of passing trials (non-negative integer, at most `total`).
 * @param total Number of trials (non-negative integer).
 * @returns Lower and upper bounds, each clamped to [0, 1]. With zero trials
 *   the interval is the uninformative { lower: 0, upper: 1 }.
 * @throws AssertionError when an argument is not a non-negative integer or
 *   when `passes` exceeds `total`.
 */
export function wilsonInterval(passes: number, total: number): { lower: number; upper: number } {
	const isCount = (value: number): boolean => Number.isInteger(value) && value >= 0;
	assert(isCount(passes), 'wilsonInterval: passes must be a non-negative integer');
	assert(isCount(total), 'wilsonInterval: total must be a non-negative integer');
	assert(passes <= total, 'wilsonInterval: passes cannot exceed total');

	if (total === 0) {
		return { lower: 0, upper: 1 };
	}

	const rate = passes / total;
	const zSquared = Z_95 * Z_95;
	// Shared divisor that pulls both the centre and the margin towards 1/2.
	const shrink = 1 + zSquared / total;
	const spread = (rate * (1 - rate)) / total + zSquared / (4 * total * total);

	const midpoint = (rate + zSquared / (2 * total)) / shrink;
	const margin = (Z_95 * Math.sqrt(spread)) / shrink;

	const lower = Math.max(0, midpoint - margin);
	const upper = Math.min(1, midpoint + margin);
	return { lower, upper };
}
// ---------------------------------------------------------------------------
// Per-scenario classification
//
// Three flag tiers, evaluated in order of strictness:
//
// hard_regression — high-confidence drop on a reliable baseline.
// Gating-grade.
// soft_regression — looser bar; investigate, not gating.
// watch — moved noticeably but didn't pass either flag tier.
// Pure visibility.
//
// Improvements use the hard tier (we don't surface borderline improvements;
// they tend to be noise in the positive direction).
// ---------------------------------------------------------------------------
/**
 * Label assigned to one scenario after comparing its PR results with its
 * baseline results. Exactly one label is produced per scenario.
 */
export type ScenarioVerdict =
| 'hard_regression' // PR is confidently worse, baseline was reliable
| 'soft_regression' // looser bar — worth investigating, not high-confidence
| 'watch' // moved enough to surface but no flag tier triggered
| 'improvement' // PR is significantly better (judged against the hard thresholds)
| 'stable' // no meaningful change
| 'unreliable_baseline' // confident drop but baseline was too flaky to trust
| 'insufficient_data'; // either side had zero trials
/**
 * Result of classifying one scenario: the verdict plus the numbers it was
 * derived from, so callers can report them alongside the label.
 */
export interface ScenarioClassification {
/** The label chosen for this scenario. */
verdict: ScenarioVerdict;
/** PR pass rate (0..1); 0 when the PR side had no trials. */
prPassRate: number;
/** Baseline pass rate (0..1); 0 when the baseline side had no trials. */
baselinePassRate: number;
/** PR rate minus baseline rate, signed. Negative = PR worse. */
delta: number;
/** Probability the PR is at least this much worse by chance. Lower ⇒ stronger regression evidence. Set to 1 when either side had no trials. */
pValueLeft: number;
/** Probability the PR is at least this much better by chance. Lower ⇒ stronger improvement evidence. Set to 1 when either side had no trials. */
pValueRight: number;
}
/**
 * The gates a scenario must clear to be flagged at a given tier.
 * All applicable gates must pass for the tier to fire.
 */
export interface TierThresholds {
/** Flag only when the chance the gap happened by noise is below this (strictly less than). */
maxPValue: number;
/** Flag only when the absolute pass-rate gap is at least this large (0..1). */
minDelta: number;
/** Flag only when the baseline pass rate was at least this high (0..1). Applied to regressions only; improvements skip this gate. */
minBaselinePassRate: number;
}
/**
 * Optional threshold overrides for classifyScenario. Every field is optional;
 * any tier field left out keeps its default value.
 */
export interface ClassifyOptions {
/** Hard-flag thresholds (most strict). Defaults: maxPValue=0.05, minDelta=0.30, minBaselinePassRate=0.70. */
hard?: Partial<TierThresholds>;
/** Soft-flag thresholds (looser). Defaults: maxPValue=0.20, minDelta=0.15, minBaselinePassRate=0.50. */
soft?: Partial<TierThresholds>;
/** Absolute pass-rate change required for a "watch" verdict regardless of significance. Default 0.35. */
watchDelta?: number;
}
// Hard tier defaults: p-value below 0.05, a drop of at least 30pp, and a
// baseline that passed at least 70% of the time.
const DEFAULT_HARD: TierThresholds = {
maxPValue: 0.05,
minDelta: 0.3,
minBaselinePassRate: 0.7,
};
// Soft tier defaults: looser on all three gates — p-value below 0.20, a drop
// of at least 15pp, and a baseline that passed at least 50% of the time.
const DEFAULT_SOFT: TierThresholds = {
maxPValue: 0.2,
minDelta: 0.15,
minBaselinePassRate: 0.5,
};
// Watch threshold: surface scenarios whose pass rate changed by at least 35pp
// without reaching a flag tier. High enough that natural noise on rock-solid
// scenarios (e.g. 2/3 vs 10/10 = 33pp) doesn't crowd the comment.
const DEFAULT_WATCH_DELTA = 0.35;
/**
 * Tests one tier's gates against a scenario's statistics.
 *
 * @param pValue One-sided p-value for the direction being tested.
 * @param delta Signed pass-rate gap (PR minus baseline).
 * @param baselineRate Baseline pass rate (0..1).
 * @param tier Thresholds to test against.
 * @param direction 'worse' checks for a drop of at least `minDelta` on a
 *   sufficiently reliable baseline; 'better' checks for a gain of at least
 *   `minDelta` and ignores baseline reliability.
 * @returns true when every applicable gate passes.
 */
function meetsThreshold(
	pValue: number,
	delta: number,
	baselineRate: number,
	tier: TierThresholds,
	direction: 'worse' | 'better',
): boolean {
	// Significance gate applies in both directions.
	if (pValue >= tier.maxPValue) {
		return false;
	}

	if (direction === 'worse') {
		const dropTooSmall = delta > -tier.minDelta;
		const baselineTooFlaky = baselineRate < tier.minBaselinePassRate;
		return !(dropTooSmall || baselineTooFlaky);
	}

	// Improvements skip the reliability gate — fixing flaky scenarios is a real win.
	const gainTooSmall = delta < tier.minDelta;
	return !gainTooSmall;
}
/**
 * Classify a single scenario into one of seven verdicts. See ScenarioVerdict
 * for the tier semantics.
 *
 * Checks run in this order and the first match wins: improvement, hard
 * regression, unreliable baseline, soft regression, watch, stable.
 *
 * `options` exists for tests; production callers leave thresholds at defaults.
 *
 * @param prPasses Number of passing PR trials.
 * @param prTotal Number of PR trials.
 * @param baselinePasses Number of passing baseline trials.
 * @param baselineTotal Number of baseline trials.
 * @param options Optional threshold overrides, merged over the defaults.
 * @returns The verdict together with both pass rates, the signed delta and
 *   the two one-sided p-values. When either total is 0 the verdict is
 *   'insufficient_data' and both p-values are 1.
 * @throws AssertionError (from fishersExactOneSidedLeft) when both totals are
 *   non-zero and the counts do not form a table of non-negative integers,
 *   e.g. passes greater than total.
 */
export function classifyScenario(
	prPasses: number,
	prTotal: number,
	baselinePasses: number,
	baselineTotal: number,
	options: ClassifyOptions = {},
): ScenarioClassification {
	const hard: TierThresholds = { ...DEFAULT_HARD, ...options.hard };
	const soft: TierThresholds = { ...DEFAULT_SOFT, ...options.soft };
	const watchDelta = options.watchDelta ?? DEFAULT_WATCH_DELTA;

	const prPassRate = prTotal > 0 ? prPasses / prTotal : 0;
	const baselinePassRate = baselineTotal > 0 ? baselinePasses / baselineTotal : 0;

	if (prTotal === 0 || baselineTotal === 0) {
		return {
			verdict: 'insufficient_data',
			prPassRate,
			baselinePassRate,
			// At least one rate is 0 on this path, so the subtraction is exact.
			delta: prPassRate - baselinePassRate,
			pValueLeft: 1,
			pValueRight: 1,
		};
	}

	const prFails = prTotal - prPasses;
	const baselineFails = baselineTotal - baselinePasses;

	// Left tail: chance of the PR doing this badly or worse.
	const pValueLeft = fishersExactOneSidedLeft(prPasses, prFails, baselinePasses, baselineFails);
	// Right tail, obtained by swapping the rows: the baseline doing this badly
	// or worse is the same event as the PR doing this well or better.
	const pValueRight = fishersExactOneSidedLeft(baselinePasses, baselineFails, prPasses, prFails);

	// Compute the gap from the exact integer cross-product with one division.
	// Subtracting the two already-rounded rates can land one ulp on the wrong
	// side of a threshold: 55/100 vs 70/100 subtracts to slightly above -0.15,
	// so an exactly-15pp drop would miss the "at least 15pp" gate.
	const delta = (prPasses * baselineTotal - baselinePasses * prTotal) / (prTotal * baselineTotal);

	const withVerdict = (verdict: ScenarioVerdict): ScenarioClassification => ({
		verdict,
		prPassRate,
		baselinePassRate,
		delta,
		pValueLeft,
		pValueRight,
	});

	// Improvement (right tail) — single tier, hard thresholds only.
	if (meetsThreshold(pValueRight, delta, baselinePassRate, hard, 'better')) {
		return withVerdict('improvement');
	}

	// Hard regression — passes all three hard gates.
	if (meetsThreshold(pValueLeft, delta, baselinePassRate, hard, 'worse')) {
		return withVerdict('hard_regression');
	}

	// Confident drop, but on a baseline too flaky to call a regression.
	// Surface as `unreliable_baseline` so it's visible without being a flag.
	const confidentDrop = pValueLeft < hard.maxPValue && delta <= -hard.minDelta;
	if (confidentDrop && baselinePassRate < hard.minBaselinePassRate) {
		return withVerdict('unreliable_baseline');
	}

	// Soft regression — passes the looser gates.
	if (meetsThreshold(pValueLeft, delta, baselinePassRate, soft, 'worse')) {
		return withVerdict('soft_regression');
	}

	// Watch — meaningful movement but no flag fired. Pure visibility.
	if (Math.abs(delta) >= watchDelta) {
		return withVerdict('watch');
	}

	return withVerdict('stable');
}