n8n/packages/@n8n/instance-ai/evaluations/comparison/compare.ts
José Braulio González Valido bbe3e2d148
feat(ai-builder): Add per-PR eval regression detection vs LangSmith baseline (#29456)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-06 08:15:08 +00:00

334 lines
11 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// ---------------------------------------------------------------------------
// Comparison core: take two experiment buckets, return a ComparisonResult.
//
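// No network or file I/O; the only side effect is a console warning when a
// bucket contains an unrecognised failure category. The tier thresholds
// (p-value cutoff, minimum delta, minimum baseline pass rate) live in
// statistics.ts — there's no CLI knob. Tune them there if the false-positive
// rate drifts.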
// ---------------------------------------------------------------------------
import {
classifyScenario,
wilsonInterval,
type ClassifyOptions,
type ScenarioClassification,
type ScenarioVerdict,
} from './statistics';
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
/** Pass/trial tallies for one scenario within a single experiment. */
export interface ScenarioCounts {
testCaseFile: string;
scenarioName: string;
/** Passing trials; numerator of the scenario's pass rate. */
passed: number;
/** Trials counted for the scenario; denominator of the pass rate. */
total: number;
/** Optional failure counts for this scenario — presumably keyed by failure-category name. */
failureCategories?: Record<string, number>;
}
/** One experiment's results, as consumed by `compareBuckets`. */
export interface ExperimentBucket {
experimentName: string;
/**
* Per-scenario tallies. `compareBuckets` pairs PR and baseline entries by
* looking up the same map key on both sides.
*/
scenarios: Map<string, ScenarioCounts>;
/**
* Aggregated failure-category counts across all trials in all scenarios.
* Used for the run-level failure-category drift table — orthogonal to
* per-scenario verdicts.
*/
failureCategoryTotals?: Record<string, number>;
/**
* Denominator for failure-category rates (count / trialTotal). When it is
* missing or 0 on either side, no failure-category rows are produced.
*/
trialTotal?: number;
}
/**
* One scenario present in both experiments: the raw pass/trial counts from
* each side plus the classification fields returned by `classifyScenario`.
*/
export interface ScenarioComparison extends ScenarioClassification {
testCaseFile: string;
scenarioName: string;
/** `passed` from the PR bucket's counts for this scenario. */
prPasses: number;
/** `total` from the PR bucket's counts for this scenario. */
prTotal: number;
/** `passed` from the baseline bucket's counts for this scenario. */
baselinePasses: number;
/** `total` from the baseline bucket's counts for this scenario. */
baselineTotal: number;
}
/** Run-level pass-rate summary over the scenarios present in both experiments. */
export interface AggregateComparison {
/** Number of scenarios found in both the PR and the baseline bucket. */
intersectionSize: number;
/** Summed PR passes / summed PR trials over the intersection (0 when there are no trials). */
prAggregatePassRate: number;
/** Summed baseline passes / summed baseline trials over the intersection (0 when there are no trials). */
baselineAggregatePassRate: number;
/** `wilsonInterval` of the summed PR passes and trials. */
prAggregateCI: { lower: number; upper: number };
/** `wilsonInterval` of the summed baseline passes and trials. */
baselineAggregateCI: { lower: number; upper: number };
/** prAggregatePassRate - baselineAggregatePassRate; negative means the PR passes less often. */
delta: number;
}
/** Drift of a single failure category between the baseline and the PR run. */
export interface FailureCategoryComparison {
category: string;
prCount: number;
prRate: number; // count / trialTotal
baselineCount: number;
baselineRate: number;
delta: number; // prRate - baselineRate
/** True when both the rate gap and the scaled count gap reach their notable thresholds. */
notable: boolean;
}
/** Full output of `compareBuckets`. */
export interface ComparisonResult {
pr: { experimentName: string };
baseline: { experimentName: string };
/** Micro-averaged pass rates over the scenarios present on both sides. */
aggregate: AggregateComparison;
/** Scenarios present on both sides, sorted by verdict rank, then file, then name. */
scenarios: ScenarioComparison[];
/** Scenarios that exist only in the PR bucket; excluded from the aggregate. */
prOnly: Array<{ testCaseFile: string; scenarioName: string }>;
/** Scenarios that exist only in the baseline bucket; excluded from the aggregate. */
baselineOnly: Array<{ testCaseFile: string; scenarioName: string }>;
/** Failure-category drift rows; empty when either side lacks category totals or trials. */
failureCategories: FailureCategoryComparison[];
}
/**
* Result of a comparison attempt. The `kind` field distinguishes between
* "ran successfully", "skipped intentionally" (no baseline yet, current run
* IS the baseline), and "failed unexpectedly" (LangSmith API error, fetch
* timeout, etc.). The PR comment renders a different alert per kind so
* readers can tell a missing baseline from a regression-detection outage.
*/
export type ComparisonOutcome =
// Comparison ran; `result` carries the full diff.
| { kind: 'ok'; result: ComparisonResult }
// Skipped: there is no baseline to compare against yet.
| { kind: 'no_baseline' }
// Skipped: the current run is itself the baseline (`experimentName` presumably names that run).
| { kind: 'self_baseline'; experimentName: string }
// Failed unexpectedly; `error` presumably holds a human-readable message.
| { kind: 'fetch_failed'; error: string };
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Select the scenarios classified as `hard_regression` — the high-confidence
 * tier that is suitable for gating.
 */
export function hardRegressions(result: ComparisonResult): ScenarioComparison[] {
	const { scenarios } = result;
	return scenarios.filter(({ verdict }) => verdict === 'hard_regression');
}
/**
 * Select the scenarios classified as `soft_regression` — flagged under looser
 * thresholds, so worth a look but not meant to gate.
 */
export function softRegressions(result: ComparisonResult): ScenarioComparison[] {
	const { scenarios } = result;
	return scenarios.filter(({ verdict }) => verdict === 'soft_regression');
}
/**
 * Select the scenarios classified as `watch`: they moved by at least
 * watchDelta without reaching a flag tier. Shown for visibility only.
 */
export function watchList(result: ComparisonResult): ScenarioComparison[] {
	const { scenarios } = result;
	return scenarios.filter(({ verdict }) => verdict === 'watch');
}
/** Select the scenarios classified as `improvement`. */
export function improvements(result: ComparisonResult): ScenarioComparison[] {
	const { scenarios } = result;
	return scenarios.filter(({ verdict }) => verdict === 'improvement');
}
/**
 * Count how many compared scenarios landed on each verdict. Every verdict is
 * present in the returned record, at 0 when no scenario carries it.
 */
export function byVerdict(result: ComparisonResult): Record<ScenarioVerdict, number> {
	const tally: Record<ScenarioVerdict, number> = {
		hard_regression: 0,
		soft_regression: 0,
		watch: 0,
		improvement: 0,
		stable: 0,
		unreliable_baseline: 0,
		insufficient_data: 0,
	};

	for (const { verdict } of result.scenarios) {
		tally[verdict] += 1;
	}

	return tally;
}
// ---------------------------------------------------------------------------
// Compare
// ---------------------------------------------------------------------------
/**
 * Diff a PR experiment against a baseline experiment.
 *
 * Scenarios are paired by their map key. Only scenarios present on *both*
 * sides are classified and counted toward the aggregate, because those are
 * the only ones whose rates are directly comparable. Scenarios found on a
 * single side are reported through `prOnly` / `baselineOnly` instead.
 *
 * The aggregate pass rate is the *micro* average: summed passes divided by
 * summed trials across the paired scenarios.
 *
 * `options` is forwarded to `classifyScenario`. It exists for tests;
 * production callers omit it.
 */
export function compareBuckets(
	pr: ExperimentBucket,
	baseline: ExperimentBucket,
	options: ClassifyOptions = {},
): ComparisonResult {
	const paired: ScenarioComparison[] = [];
	const prOnly: Array<{ testCaseFile: string; scenarioName: string }> = [];
	// Pass/trial sums over the paired scenarios only; they feed the micro average.
	const sums = { prPasses: 0, prTrials: 0, basePasses: 0, baseTrials: 0 };

	for (const [key, prCounts] of pr.scenarios) {
		const baseCounts = baseline.scenarios.get(key);
		const { testCaseFile, scenarioName } = prCounts;

		if (!baseCounts) {
			prOnly.push({ testCaseFile, scenarioName });
			continue;
		}

		sums.prPasses += prCounts.passed;
		sums.prTrials += prCounts.total;
		sums.basePasses += baseCounts.passed;
		sums.baseTrials += baseCounts.total;

		const classification = classifyScenario(
			prCounts.passed,
			prCounts.total,
			baseCounts.passed,
			baseCounts.total,
			options,
		);
		paired.push({
			testCaseFile,
			scenarioName,
			prPasses: prCounts.passed,
			prTotal: prCounts.total,
			baselinePasses: baseCounts.passed,
			baselineTotal: baseCounts.total,
			...classification,
		});
	}

	// Whatever the PR bucket has no entry for exists only in the baseline.
	const baselineOnly: Array<{ testCaseFile: string; scenarioName: string }> = Array.from(
		baseline.scenarios,
	)
		.filter(([key]) => !pr.scenarios.has(key))
		.map(([, counts]) => ({
			testCaseFile: counts.testCaseFile,
			scenarioName: counts.scenarioName,
		}));

	const prPassRate = rate(sums.prPasses, sums.prTrials);
	const baselinePassRate = rate(sums.basePasses, sums.baseTrials);
	const aggregate: AggregateComparison = {
		intersectionSize: paired.length,
		prAggregatePassRate: prPassRate,
		baselineAggregatePassRate: baselinePassRate,
		prAggregateCI: wilsonInterval(sums.prPasses, sums.prTrials),
		baselineAggregateCI: wilsonInterval(sums.basePasses, sums.baseTrials),
		delta: prPassRate - baselinePassRate,
	};

	// Sort in place: verdict rank first, then file, then scenario name.
	paired.sort(scenarioComparator);

	const failureCategories = compareFailureCategories(pr, baseline);

	return {
		pr: { experimentName: pr.experimentName },
		baseline: { experimentName: baseline.experimentName },
		aggregate,
		scenarios: paired,
		prOnly,
		baselineOnly,
		failureCategories,
	};
}
// ---------------------------------------------------------------------------
// Failure-category drift
// ---------------------------------------------------------------------------
/** Min absolute rate gap to consider a category notable (5 percentage points). */
const CATEGORY_NOTABLE_RATE_DELTA = 0.05;
/**
* Min absolute gap, in trials, between the PR count and the baseline count
* rescaled to the PR's trial total. Required alongside the rate gap.
*/
const CATEGORY_NOTABLE_COUNT_DELTA = 3;
/**
* Categories the verifier is supposed to emit. Anything else (malformed
* strings like `-`, `>builder_issue`, empty, etc.) is dropped from the
* comparison so the PR comment doesn't display verifier noise. Keep in sync
* with the verifier's category enum; unknown values are reported through
* `console.warn` (see compareFailureCategories).
*/
const KNOWN_FAILURE_CATEGORIES = new Set([
'builder_issue',
'mock_issue',
'framework_issue',
'verification_failure',
'build_failure',
]);
/**
 * Decide whether a failure category moved enough between baseline and PR to
 * be called out.
 *
 * Two conditions must both hold:
 *  1. the absolute gap between the two rates reaches
 *     `CATEGORY_NOTABLE_RATE_DELTA`, and
 *  2. the absolute gap between the PR count and the baseline count rescaled
 *     to the PR's trial total reaches `CATEGORY_NOTABLE_COUNT_DELTA`.
 *
 * @param prCount Failures of this category in the PR run.
 * @param prTotal Trials in the PR run.
 * @param baselineCount Failures of this category in the baseline run.
 * @param baselineTotal Trials in the baseline run.
 * @returns `true` when both thresholds are met; always `false` when either
 *   trial total is not positive, since no rate can be computed then.
 */
function isCategoryNotable(
	prCount: number,
	prTotal: number,
	baselineCount: number,
	baselineTotal: number,
): boolean {
	// Without trials the divisions below yield NaN/Infinity. NaN would slip
	// past the `<` rate check and Infinity would always clear it, so a missing
	// run could be reported as notable drift.
	if (!(prTotal > 0) || !(baselineTotal > 0)) return false;

	const rateGap = Math.abs(prCount / prTotal - baselineCount / baselineTotal);
	if (rateGap < CATEGORY_NOTABLE_RATE_DELTA) return false;

	// Rescale the baseline count to the PR's trial total so the two counts are
	// comparable even when the runs differ in size.
	const expectedPrCount = baselineCount * (prTotal / baselineTotal);
	return Math.abs(prCount - expectedPrCount) >= CATEGORY_NOTABLE_COUNT_DELTA;
}
/**
 * Build the run-level failure-category drift table.
 *
 * Returns an empty list when either bucket lacks `failureCategoryTotals` or
 * has a missing/zero `trialTotal`, because rates cannot be computed then.
 * Otherwise emits exactly one row per entry of `KNOWN_FAILURE_CATEGORIES`
 * (including rows where both sides are 0), sorted with notable rows first
 * and then by absolute rate delta, largest first.
 *
 * Side effect: each category name outside `KNOWN_FAILURE_CATEGORIES` is
 * reported once through `console.warn`, even if both buckets contain it.
 */
function compareFailureCategories(
	pr: ExperimentBucket,
	baseline: ExperimentBucket,
): FailureCategoryComparison[] {
	const prCounts = pr.failureCategoryTotals;
	const baselineCounts = baseline.failureCategoryTotals;
	if (!prCounts || !baselineCounts) return [];

	const prTotal = pr.trialTotal ?? 0;
	const baseTotal = baseline.trialTotal ?? 0;
	if (prTotal === 0 || baseTotal === 0) return [];

	// Surface unrecognised values so we notice when the verifier adds a new
	// category (or starts emitting noise we should clean up). They never enter
	// the comparison output. The Set collapses names seen on both sides so the
	// same warning is not printed twice.
	const unknownCategories = new Set(
		[...Object.keys(prCounts), ...Object.keys(baselineCounts)].filter(
			(category) => !KNOWN_FAILURE_CATEGORIES.has(category),
		),
	);
	for (const category of unknownCategories) {
		console.warn(`[comparison] dropping unknown failureCategory "${category}"`);
	}

	// Always emit a row for every known category, even if both sides are 0.
	// The renderer can decide whether to suppress 0/0 rows; this gives readers
	// a complete picture of the failure-type taxonomy by default.
	const out: FailureCategoryComparison[] = [];
	for (const category of KNOWN_FAILURE_CATEGORIES) {
		const prCount = prCounts[category] ?? 0;
		const baselineCount = baselineCounts[category] ?? 0;
		const prRate = prCount / prTotal;
		const baselineRate = baselineCount / baseTotal;
		out.push({
			category,
			prCount,
			prRate,
			baselineCount,
			baselineRate,
			delta: prRate - baselineRate,
			notable: isCategoryNotable(prCount, prTotal, baselineCount, baseTotal),
		});
	}

	// Sort: notable first, then by absolute delta descending.
	out.sort((a, b) => {
		if (a.notable !== b.notable) return a.notable ? -1 : 1;
		return Math.abs(b.delta) - Math.abs(a.delta);
	});
	return out;
}
/** Safe ratio: `passes / total`, falling back to 0 when `total` is not positive. */
function rate(passes: number, total: number): number {
	if (total > 0) {
		return passes / total;
	}
	return 0;
}
/**
* Sort rank for each verdict; `scenarioComparator` places lower ranks first.
* Regressions lead, followed by improvements and watch items, with
* insufficient-data scenarios last.
*/
const VERDICT_ORDER: Record<ScenarioComparison['verdict'], number> = {
hard_regression: 0,
soft_regression: 1,
improvement: 2,
watch: 3,
unreliable_baseline: 4,
stable: 5,
insufficient_data: 6,
};
/**
 * Ordering for compared scenarios: by verdict rank (see `VERDICT_ORDER`),
 * then by test-case file, then by scenario name. Both string comparisons use
 * `localeCompare`.
 */
function scenarioComparator(a: ScenarioComparison, b: ScenarioComparison): number {
	const [rankA, rankB] = [VERDICT_ORDER[a.verdict], VERDICT_ORDER[b.verdict]];
	if (rankA !== rankB) return rankA - rankB;

	// Same verdict: fall back to file, and to scenario name only on a file tie.
	return (
		a.testCaseFile.localeCompare(b.testCaseFile) ||
		a.scenarioName.localeCompare(b.scenarioName)
	);
}