feat(ai-builder): Add per-PR eval regression detection vs LangSmith baseline (#29456)

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
José Braulio González Valido 2026-05-06 09:15:08 +01:00 committed by GitHub
parent 5b01cba8b2
commit bbe3e2d148
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 2818 additions and 83 deletions

View File

@ -143,7 +143,7 @@ jobs:
--base-url "$BASE_URLS" \
--concurrency 32 \
--verbose \
--iterations 3 \
--iterations 5 \
${{ inputs.filter && format('--filter "{0}"', inputs.filter) || '' }}
- name: Stop n8n containers
@ -160,22 +160,16 @@ jobs:
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
RESULTS_FILE="packages/@n8n/instance-ai/eval-results.json"
if [ ! -f "$RESULTS_FILE" ]; then
echo "No eval results file found"
# The eval CLI writes the full PR comment as eval-pr-comment.md
# (see comparison/format.ts:formatComparisonMarkdown). It includes
# the alert, aggregate, comparison sections, per-test-case results
# collapsed, and failure details collapsed. CI just relays it.
COMMENT_FILE="packages/@n8n/instance-ai/eval-pr-comment.md"
if [ ! -f "$COMMENT_FILE" ]; then
echo "No PR comment file found (eval likely cancelled before writing results)"
exit 0
fi
# Build the full comment body with jq
jq -r '
"### Instance AI Workflow Eval Results\n\n" +
"**\(.summary.built)/\(.summary.testCases) built | \(.totalRuns) run(s) | pass@\(.totalRuns): \(.summary.passAtK * 100 | floor)% | pass^\(.totalRuns): \(.summary.passHatK * 100 | floor)% | iterations: \(.summary.passRatePerIter)**\n\n" +
"| Workflow | Build | pass@\(.totalRuns) | pass^\(.totalRuns) |\n|---|---|---|---|\n" +
([.testCases[] as $tc | "| \($tc.name) | \($tc.buildSuccessCount)/\($tc.totalRuns) | \(([$tc.scenarios[] | .passAtK] | add) / ($tc.scenarios | length) * 100 | floor)% | \(([$tc.scenarios[] | .passHatK] | add) / ($tc.scenarios | length) * 100 | floor)% |"] | join("\n")) +
"\n\n<details><summary>Failure details</summary>\n\n" +
([.testCases[] as $tc | $tc.scenarios[] | select(.passHatK < 1) | "**\($tc.name) / \(.name)** — \(.passCount)/\(.totalRuns) passed" + "\n" + ([.runs[] | select(.passed == false) | "> Run\(if .failureCategory then " [\(.failureCategory)]" else "" end): \(.reasoning | .[0:200])"] | join("\n"))] | join("\n\n")) +
"\n</details>"
' "$RESULTS_FILE" > /tmp/eval-comment.md
cp "$COMMENT_FILE" /tmp/eval-comment.md
# Find and update existing eval comment, or create new one
COMMENT_ID=$(gh api "repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \

1
.gitignore vendored
View File

@ -36,6 +36,7 @@ packages/testing/playwright/playwright-report
packages/testing/playwright/test-results
packages/testing/playwright/eval-results.json
packages/@n8n/instance-ai/eval-results.json
packages/@n8n/instance-ai/eval-pr-comment.md
packages/testing/playwright/.playwright-browsers
packages/testing/playwright/.playwright-cli
test-results/

View File

@ -121,7 +121,7 @@ dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai --iterations 3
| `--base-url` | `http://localhost:5678` | n8n instance URL |
| `--email` | E2E test owner | Override login email (or `N8N_EVAL_EMAIL`) |
| `--password` | E2E test owner | Override login password (or `N8N_EVAL_PASSWORD`) |
| `--timeout-ms` | `600000` | Per-test-case timeout |
| `--timeout-ms` | `900000` | Per-test-case timeout |
| `--output-dir` | cwd | Where to write `eval-results.json` |
| `--dataset` | `instance-ai-workflow-evals` | LangSmith dataset name |
| `--concurrency` | `16` | Max concurrent scenarios (builds are separately capped at 4) |
@ -155,6 +155,47 @@ Every run produces:
**LangSmith caveat:** if `LANGSMITH_API_KEY` is set in `.env.local`, local runs also land in the shared `instance-ai-workflow-evals` dataset. Unset it (or run without `dotenvx`) to keep exploratory runs out of team results.
## Regression detection
When `LANGSMITH_API_KEY` is set, every eval run automatically compares its results against the most recent pinned baseline (any experiment whose name starts with `instance-ai-baseline-`). Two output files are written:
- `eval-results.json` — structured data only, including `comparison.result` when a baseline was found.
- `eval-pr-comment.md` — the full PR comment rendered as markdown, including the alert, aggregate, comparison sections, per-test-case results, and failure details. Always written; falls back to a no-baseline summary when no comparison ran.
The CI PR-comment step uses `eval-pr-comment.md` as the entire comment body (no jq assembly in the workflow). The console output uses a separate aligned-text formatter — same data, no markdown noise in the terminal.
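For tooling that consumes the JSON instead of the markdown, `comparisonStatus` is the field to branch on. A minimal sketch (field names match what `writeEvalResults` emits in this PR; the consumer logic itself is illustrative):
```ts
import { readFileSync } from 'node:fs';

// Subset of the report shape written by writeEvalResults; unused fields omitted.
interface EvalResults {
	experimentName?: string;
	comparisonStatus: 'ok' | 'no_baseline' | 'self_baseline' | 'fetch_failed' | 'not_attempted';
	comparisonError?: string;
	comparison?: { baseline: string; result: { scenarios: Array<{ verdict: string }> } };
}

const report = JSON.parse(
	readFileSync('packages/@n8n/instance-ai/eval-results.json', 'utf8'),
) as EvalResults;

if (report.comparisonStatus === 'ok') {
	const hard = report.comparison?.result.scenarios.filter((s) => s.verdict === 'hard_regression');
	console.log(`${hard?.length ?? 0} hard regression(s) vs ${report.comparison?.baseline}`);
} else if (report.comparisonStatus === 'fetch_failed') {
	// Real outage, not a benign skip.
	console.warn(`Regression detection did not run: ${report.comparisonError}`);
} else {
	console.log(`Comparison skipped: ${report.comparisonStatus}`);
}
```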
### Refreshing the baseline
There is no auto-refresh — refresh explicitly when you want a new reference point, ideally with high N for low noise:
```bash
# From packages/@n8n/instance-ai/, on master at the version you want to pin
LANGSMITH_API_KEY=... dotenvx run -f ../../../.env.local -- \
pnpm eval:instance-ai --experiment-name instance-ai-baseline --iterations 10
```
LangSmith appends a random suffix (e.g. `instance-ai-baseline-7abc1234`); the most recently started one becomes the comparison target on the next eval run. The comparison is silently skipped on the baseline-creation run itself.
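The selection rule is simply "newest start time among the prefixed experiments wins". Independent of the LangSmith client (the real lookup is `findLatestBaseline` in `comparison/fetch-baseline.ts`), it amounts to roughly:
```ts
interface ExperimentInfo {
	name: string;
	startTime: Date;
}

// Hypothetical helper over already-fetched experiment metadata; the real
// findLatestBaseline lists the LangSmith projects itself and applies this rule.
function pickBaseline(experiments: ExperimentInfo[]): string | undefined {
	return experiments
		.filter((e) => e.name.startsWith('instance-ai-baseline-'))
		.sort((a, b) => b.startTime.getTime() - a.startTime.getTime())[0]?.name;
}
```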
### How scenarios are tiered
Each scenario lands in one of three regression tiers, evaluated in order of strictness:
- **Regression** — high-confidence flag, gating-grade. The drop must be statistically significant (chance of seeing it by noise < 5%), at least 30 percentage points in size, and the baseline must have been reliable (≥ 70% pass rate).
- **Soft regression** — a looser bar for visibility on borderline cases: relaxed confidence threshold (chance by noise < 20%), drop of at least 15 percentage points, baseline at least 50%. Often just natural variance; worth a glance only if your changes touch related code paths.
- **Notable movement** — any scenario whose pass rate moved by ≥ 35 percentage points without reaching either flag tier. Pure visibility, no implication of cause.
Other verdicts: `improvement` (PR significantly better, skips the reliability gate), `unreliable_baseline` (confident drop but baseline was too flaky to call a regression — surfaced but not flagged), `stable`, `insufficient_data`.
Why these tiers and not a flat percentage threshold? At the small N that PR runs use (5 iterations in CI), a flat threshold can't tell a real regression from coin-flip noise. The confidence cutoff filters out gaps that could plausibly happen by chance, and the reliability gate avoids chasing noise on already-flaky scenarios. The implementation lives in `comparison/statistics.ts` (Fisher's exact test for the confidence check, Wilson interval for the headline aggregate band). Tune the soft tier first if the false-positive rate looks off — keep the hard tier strict.
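Condensed into code, the per-scenario decision looks roughly like the sketch below. Thresholds mirror the prose above; the real `classifyScenario` in `comparison/statistics.ts` computes the p-value itself and also emits `improvement`, `unreliable_baseline`, and `insufficient_data`:
```ts
type Tier = { maxPValue: number; minDelta: number; minBaselinePassRate: number };

// Default thresholds from the prose above (hard: p < 5%, drop ≥ 30pp, baseline ≥ 70%;
// soft: p < 20%, drop ≥ 15pp, baseline ≥ 50%; notable movement at ≥ 35pp).
const HARD: Tier = { maxPValue: 0.05, minDelta: 0.3, minBaselinePassRate: 0.7 };
const SOFT: Tier = { maxPValue: 0.2, minDelta: 0.15, minBaselinePassRate: 0.5 };
const WATCH_DELTA = 0.35;

function tierOf(
	prPassRate: number,
	baselinePassRate: number,
	pValue: number, // Fisher's exact one-sided p for "PR is worse"
): 'hard_regression' | 'soft_regression' | 'watch' | 'stable' {
	const drop = baselinePassRate - prPassRate;
	const meets = (t: Tier) =>
		pValue <= t.maxPValue && drop >= t.minDelta && baselinePassRate >= t.minBaselinePassRate;
	if (meets(HARD)) return 'hard_regression';
	if (meets(SOFT)) return 'soft_regression';
	if (Math.abs(drop) >= WATCH_DELTA) return 'watch';
	return 'stable';
}

// From the test suite: PR 5/10 vs baseline 8/10 with Fisher p ≈ 0.18 lands in the soft tier.
tierOf(0.5, 0.8, 0.18); // 'soft_regression'
```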
### Failure-category drift
When both sides captured per-trial `failureCategory` values, the comparison also surfaces a run-level table of category rates (PR vs baseline). A category is marked **notable** when its absolute rate delta is ≥ 5 percentage points _and_ the count change beyond what scenario-count scaling would predict is ≥ 3 trials. This catches cross-scenario shifts (e.g. mock-generation breaking, or a model getting weaker overall) that per-scenario flags can miss.
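The check itself is two guards, mirroring `isCategoryNotable` in `comparison/compare.ts` (added in this PR):
```ts
// A 5pp rate gap alone is not enough; the raw count must also move by ≥ 3 trials
// beyond what scaling the baseline to the PR's trial total would predict.
function isCategoryNotable(
	prCount: number,
	prTotal: number,
	baselineCount: number,
	baselineTotal: number,
): boolean {
	const rateGap = Math.abs(prCount / prTotal - baselineCount / baselineTotal);
	if (rateGap < 0.05) return false;
	const expectedPrCount = baselineCount * (prTotal / baselineTotal);
	return Math.abs(prCount - expectedPrCount) >= 3;
}

// Worked example from the tests: framework_issue at 0/290 on the baseline vs 9/145 on
// the PR clears both bars (rate gap 6.2pp, count gap 9). A single stray failure on a
// tiny run (1/3 vs 0/270) clears the rate bar but not the count bar.
isCategoryNotable(9, 145, 0, 290); // true
isCategoryNotable(1, 3, 0, 270); // false
```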
### Best-effort
Comparison is logged and skipped on any LangSmith failure — it never fails the eval. It is also skipped when no baseline experiment exists yet.
## Pairwise evals
Pairwise evals score a built workflow against the dataset's `dos` / `donts`

View File

@ -0,0 +1,190 @@
import { compareBuckets, type ExperimentBucket, type ScenarioCounts } from '../comparison/compare';
function bucket(
name: string,
scenarios: ScenarioCounts[],
categories?: { totals: Record<string, number>; trialTotal: number },
): ExperimentBucket {
return {
experimentName: name,
scenarios: new Map(scenarios.map((s) => [`${s.testCaseFile}/${s.scenarioName}`, s])),
failureCategoryTotals: categories?.totals,
trialTotal: categories?.trialTotal,
};
}
function s(file: string, scenario: string, passed: number, total: number): ScenarioCounts {
return { testCaseFile: file, scenarioName: scenario, passed, total };
}
describe('compareBuckets', () => {
it('produces a clean intersection when both sides have the same scenarios', () => {
const pr = bucket('pr', [s('contact', 'happy', 8, 10), s('weather', 'happy', 1, 10)]);
const base = bucket('master', [s('contact', 'happy', 9, 10), s('weather', 'happy', 0, 10)]);
const result = compareBuckets(pr, base);
expect(result.scenarios).toHaveLength(2);
expect(result.prOnly).toEqual([]);
expect(result.baselineOnly).toEqual([]);
expect(result.aggregate.intersectionSize).toBe(2);
});
it('flags scenarios only present on one side', () => {
const pr = bucket('pr', [s('contact', 'happy', 5, 10)]);
const base = bucket('master', [s('contact', 'happy', 8, 10), s('weather', 'happy', 5, 10)]);
const result = compareBuckets(pr, base);
expect(result.scenarios).toHaveLength(1);
expect(result.scenarios[0].testCaseFile).toBe('contact');
expect(result.baselineOnly).toEqual([{ testCaseFile: 'weather', scenarioName: 'happy' }]);
expect(result.prOnly).toEqual([]);
});
it('aggregates only over the intersection, not over baseline-only or pr-only', () => {
const pr = bucket('pr', [s('contact', 'happy', 10, 10)]);
const base = bucket('master', [s('contact', 'happy', 5, 10), s('other', 'happy', 0, 10)]);
const result = compareBuckets(pr, base);
expect(result.aggregate.prAggregatePassRate).toBe(1);
expect(result.aggregate.baselineAggregatePassRate).toBe(0.5);
expect(result.aggregate.intersectionSize).toBe(1);
});
it('sorts scenarios with regressions first, then improvements, then stable', () => {
const pr = bucket('pr', [
s('a', 'stable', 10, 10),
s('b', 'regression', 0, 10),
s('c', 'improvement', 10, 10),
]);
const base = bucket('master', [
s('a', 'stable', 10, 10),
s('b', 'regression', 10, 10),
s('c', 'improvement', 0, 10),
]);
const result = compareBuckets(pr, base);
expect(result.scenarios.map((sc) => sc.scenarioName)).toEqual([
'regression',
'improvement',
'stable',
]);
});
it('returns insufficient_data when one side has zero trials for a scenario', () => {
const pr = bucket('pr', [s('contact', 'happy', 0, 0)]);
const base = bucket('master', [s('contact', 'happy', 10, 10)]);
const result = compareBuckets(pr, base);
expect(result.scenarios[0].verdict).toBe('insufficient_data');
});
it('returns no failure-category drift when either side lacks category totals', () => {
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10)]);
expect(compareBuckets(pr, base).failureCategories).toEqual([]);
});
it('flags a category as notable when both rate and trial-count gaps clear the bars', () => {
// Haiku-style shift: framework_issue 0/290 → 9/145.
// Rate gap: 6.2pp ≥ 5pp ✓. Expected PR count given baseline = 0 × (145/290) = 0; |9 - 0| = 9 ≥ 3 ✓.
const pr = bucket('pr', [s('a', 'happy', 50, 145)], {
totals: { framework_issue: 9 },
trialTotal: 145,
});
const base = bucket('master', [s('a', 'happy', 200, 290)], {
totals: { framework_issue: 0 },
trialTotal: 290,
});
const cats = compareBuckets(pr, base).failureCategories;
const fw = cats.find((c) => c.category === 'framework_issue');
expect(fw?.notable).toBe(true);
});
it('does not flag when the rate gap is below the 5pp bar', () => {
// 3/100 vs 2/100 = 1pp gap, count gap = 1 — neither bar cleared.
const pr = bucket('pr', [s('a', 'happy', 50, 100)], {
totals: { mock_issue: 3 },
trialTotal: 100,
});
const base = bucket('master', [s('a', 'happy', 50, 100)], {
totals: { mock_issue: 2 },
trialTotal: 100,
});
const cats = compareBuckets(pr, base).failureCategories;
expect(cats.find((c) => c.category === 'mock_issue')?.notable).toBe(false);
});
it('does not flag when the rate gap is large but the count gap is tiny (small N guard)', () => {
// PR 1/3 vs baseline 0/270 — rate gap = 33pp ≥ 5pp, but expected PR count = 0
// and observed = 1, count gap = 1 < 3. Should NOT flag — single trial on small N.
const pr = bucket('pr', [s('a', 'happy', 0, 3)], {
totals: { builder_issue: 1 },
trialTotal: 3,
});
const base = bucket('master', [s('a', 'happy', 270, 270)], {
totals: { builder_issue: 0 },
trialTotal: 270,
});
const cats = compareBuckets(pr, base).failureCategories;
expect(cats.find((c) => c.category === 'builder_issue')?.notable).toBe(false);
});
it('drops unknown categories with a console warning, keeps all known categories', () => {
const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
const pr = bucket('pr', [s('a', 'happy', 8, 10)], {
totals: { '-': 5, builder_issue: 2 },
trialTotal: 10,
});
const base = bucket('master', [s('a', 'happy', 8, 10)], {
totals: { builder_issue: 1 },
trialTotal: 10,
});
const cats = compareBuckets(pr, base).failureCategories;
// All five known categories are always present (some at 0/0 — renderer
// drops those). The unknown `-` category is dropped here with a warning.
expect(cats.map((c) => c.category).sort()).toEqual([
'build_failure',
'builder_issue',
'framework_issue',
'mock_issue',
'verification_failure',
]);
expect(warn).toHaveBeenCalledWith(expect.stringContaining('"-"'));
warn.mockRestore();
});
it('sorts notable categories before non-notable, then by absolute delta', () => {
const pr = bucket('pr', [s('a', 'happy', 50, 100)], {
totals: { framework_issue: 10, mock_issue: 4, builder_issue: 25 },
trialTotal: 100,
});
const base = bucket('master', [s('a', 'happy', 50, 100)], {
totals: { framework_issue: 0, mock_issue: 3, builder_issue: 22 },
trialTotal: 100,
});
const cats = compareBuckets(pr, base).failureCategories;
// framework_issue is the only notable one (rate gap 10pp, count gap 10).
expect(cats[0].category).toBe('framework_issue');
expect(cats[0].notable).toBe(true);
expect(cats.slice(1).every((c) => !c.notable)).toBe(true);
});
it('accepts custom tiered thresholds for tests', () => {
const pr = bucket('pr', [s('a', 'happy', 5, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10)]);
// Defaults: 5/10 vs 8/10 = -30pp drop, p ≈ 0.18 → soft_regression
// (passes soft maxPValue=0.20, soft minDelta=0.15, baseline 80% above soft 50%).
const defaults = compareBuckets(pr, base);
expect(defaults.scenarios[0].verdict).toBe('soft_regression');
// Stricter soft p-value cutoff excludes this case.
const stricter = compareBuckets(pr, base, {
soft: { maxPValue: 0.1, minDelta: 0.15, minBaselinePassRate: 0.5 },
});
expect(['stable', 'watch']).toContain(stricter.scenarios[0].verdict);
});
});

View File

@ -0,0 +1,458 @@
import {
compareBuckets,
type ComparisonOutcome,
type ComparisonResult,
type ExperimentBucket,
type ScenarioCounts,
} from '../comparison/compare';
import { formatComparisonMarkdown, formatComparisonTerminal } from '../comparison/format';
import type { MultiRunEvaluation, WorkflowTestCase, ScenarioResult } from '../types';
function ok(result: ComparisonResult): ComparisonOutcome {
return { kind: 'ok', result };
}
function slugMap(evaluation: MultiRunEvaluation, slugs: string[]): Map<WorkflowTestCase, string> {
return new Map(evaluation.testCases.map((tc, i) => [tc.testCase, slugs[i] ?? 'unknown']));
}
function bucket(name: string, scenarios: ScenarioCounts[]): ExperimentBucket {
return {
experimentName: name,
scenarios: new Map(scenarios.map((s) => [`${s.testCaseFile}/${s.scenarioName}`, s])),
};
}
function s(file: string, scenario: string, passed: number, total: number): ScenarioCounts {
return { testCaseFile: file, scenarioName: scenario, passed, total };
}
/** Minimal evaluation fixture matching the shape format.ts reads. */
function evaluation(
opts: {
totalRuns?: number;
testCases?: Array<{
prompt?: string;
buildSuccessCount?: number;
scenarios?: Array<{
name: string;
passCount: number;
passes: boolean[]; // per-iteration pass/fail
reasoning?: string;
failureCategory?: string;
}>;
}>;
} = {},
): MultiRunEvaluation {
const totalRuns = opts.totalRuns ?? 3;
return {
totalRuns,
testCases: (opts.testCases ?? []).map((tc) => {
const testCase = {
prompt: tc.prompt ?? 'Test workflow prompt',
complexity: 'medium' as const,
tags: [],
scenarios: (tc.scenarios ?? []).map((sa) => ({
name: sa.name,
description: '',
dataSetup: '',
successCriteria: '',
})),
} as WorkflowTestCase;
const buildSuccessCount = tc.buildSuccessCount ?? totalRuns;
const scenarios = (tc.scenarios ?? []).map((sa) => ({
scenario: testCase.scenarios.find((sc) => sc.name === sa.name)!,
passCount: sa.passCount,
passRate: totalRuns > 0 ? sa.passCount / totalRuns : 0,
passAtK: new Array(totalRuns).fill(sa.passCount > 0 ? 1 : 0) as number[],
passHatK: new Array(totalRuns).fill(sa.passCount === totalRuns ? 1 : 0) as number[],
runs: sa.passes.map(
(passed): ScenarioResult => ({
scenario: testCase.scenarios.find((sc) => sc.name === sa.name)!,
success: passed,
score: passed ? 1 : 0,
reasoning: sa.reasoning ?? '',
failureCategory: !passed ? sa.failureCategory : undefined,
}),
),
}));
return {
testCase,
workflowBuildSuccess: buildSuccessCount > 0,
scenarioResults: [],
scenarios,
runs: new Array(totalRuns).fill(null).map(() => ({
testCase,
workflowBuildSuccess: buildSuccessCount > 0,
scenarioResults: [],
})),
buildSuccessCount,
};
}),
};
}
describe('formatComparisonMarkdown', () => {
const evalFixture = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'a',
scenarios: [{ name: 'happy', passCount: 0, passes: [false, false, false] }],
},
],
});
it('renders heading, alert, aggregate, and a regression table', () => {
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
const base = bucket('master-abc', [s('a', 'happy', 10, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/### Instance AI Workflow Eval/);
expect(md).toMatch(/> \[!CAUTION\]/);
expect(md).toMatch(/1 regression/);
expect(md).toMatch(/\*\*Aggregate\*\*: 0\.0% PR vs 100\.0% baseline/);
expect(md).toMatch(/#### Regressions \(1\)/);
expect(md).toMatch(/`a\/happy`/);
expect(md).toMatch(/0\/3 \(0%\)/);
expect(md).toMatch(/-100pp ↓/);
});
it('uses TIP alert when there are only improvements', () => {
const pr = bucket('pr', [s('a', 'happy', 3, 3)]);
const base = bucket('master', [s('a', 'happy', 0, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/> \[!TIP\]/);
expect(md).toMatch(/1 improvement/);
expect(md).toMatch(/#### Improvements \(1\)/);
expect(md).toMatch(/\+100pp ↑/);
});
it('uses TIP alert with "0 regressions" when everything is stable', () => {
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/> \[!TIP\]/);
expect(md).toMatch(/0 regressions/);
expect(md).toMatch(/1 stable/);
expect(md).not.toMatch(/#### Regressions/);
});
it('renders LangSmith-disabled NOTE when outcome is undefined', () => {
const md = formatComparisonMarkdown(evalFixture);
expect(md).toMatch(/> \[!NOTE\]/);
expect(md).toMatch(/LangSmith disabled/);
expect(md).not.toMatch(/#### Regressions/);
});
it('renders distinct alerts per skip reason', () => {
const noBase = formatComparisonMarkdown(evalFixture, { kind: 'no_baseline' });
expect(noBase).toMatch(/> \[!NOTE\]/);
expect(noBase).toMatch(/No baseline configured/);
const selfBase = formatComparisonMarkdown(evalFixture, {
kind: 'self_baseline',
experimentName: 'instance-ai-baseline-abc',
});
expect(selfBase).toMatch(/> \[!NOTE\]/);
expect(selfBase).toMatch(/This run is the baseline/);
expect(selfBase).toMatch(/instance-ai-baseline-abc/);
const fetchFail = formatComparisonMarkdown(evalFixture, {
kind: 'fetch_failed',
error: 'LangSmith 503',
});
// fetch_failed is a real outage, not a benign skip — must be a WARNING.
expect(fetchFail).toMatch(/> \[!WARNING\]/);
expect(fetchFail).toMatch(/Regression detection did not run/);
expect(fetchFail).toMatch(/LangSmith 503/);
});
it('shows mixed-case alert when both regressions and improvements exist', () => {
const pr = bucket('pr', [s('a', 'happy', 0, 3), s('b', 'happy', 3, 3)]);
const base = bucket('master', [s('a', 'happy', 10, 10), s('b', 'happy', 0, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/> \[!CAUTION\]/);
expect(md).toMatch(/1 regression/);
expect(md).toMatch(/1 improvement/);
expect(md).toMatch(/#### Regressions/);
expect(md).toMatch(/#### Improvements/);
});
it('embeds commit SHA in heading when provided', () => {
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)), {
commitSha: 'abc1234567890def',
});
expect(md).toMatch(/### Instance AI Workflow Eval — `abc12345`/);
});
it('marks new failure categories with 🆕', () => {
const pr: ExperimentBucket = {
experimentName: 'pr',
scenarios: new Map([['a/happy', { ...s('a', 'happy', 0, 3) }]]),
failureCategoryTotals: { framework_issue: 9 },
trialTotal: 145,
};
const base: ExperimentBucket = {
experimentName: 'master',
scenarios: new Map([['a/happy', { ...s('a', 'happy', 5, 10) }]]),
failureCategoryTotals: { framework_issue: 0 },
trialTotal: 290,
};
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/#### Failure breakdown/);
expect(md).toMatch(/`framework_issue` 🆕/);
expect(md).toMatch(/\*\*notable\*\*/);
});
it('always includes all five tier counts in the alert line', () => {
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/0 regressions, 0 soft, 0 notable, 0 improvements, 1 stable/);
});
it('renders a per-scenario breakdown collapsible inside the regression section', () => {
const evalWithFailures = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'a',
scenarios: [
{
name: 'happy',
passCount: 0,
passes: [false, false, false],
reasoning: 'Builder produced an unsupported node configuration',
failureCategory: 'builder_issue',
},
],
},
],
});
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
const base = bucket('master', [s('a', 'happy', 10, 10)]);
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)), {
slugByTestCase: slugMap(evalWithFailures, ['a']),
});
expect(md).toMatch(/#### Regressions \(1\)/);
// The regression row's collapsible should appear inside the Regressions
// section, before the per-test-case section, and carry the same slug.
const regressionsIdx = md.indexOf('#### Regressions');
const perTcIdx = md.indexOf('Per-test-case results');
const breakdownIdx = md.indexOf('<code>a/happy</code>');
expect(breakdownIdx).toBeGreaterThan(regressionsIdx);
expect(breakdownIdx).toBeLessThan(perTcIdx);
expect(md).toMatch(/3 of 3 failed · 3× builder_issue/);
expect(md).toMatch(/Run 1 \[builder_issue\]: Builder produced/);
});
it('uses `file/scenario` slug headers in the bottom Failure details section', () => {
const evalWithFailures = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'Build a cross-team Linear report digest',
scenarios: [
{
name: 'no-cross-team-issues',
passCount: 0,
passes: [false, false, false],
reasoning: 'reason',
failureCategory: 'builder_issue',
},
],
},
],
});
const pr = bucket('pr', [s('cross-team-linear-report', 'no-cross-team-issues', 0, 3)]);
const base = bucket('master', [s('cross-team-linear-report', 'no-cross-team-issues', 10, 10)]);
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)), {
slugByTestCase: slugMap(evalWithFailures, ['cross-team-linear-report']),
});
expect(md).toMatch(/<summary>Failure details<\/summary>/);
expect(md).toMatch(/\*\*`cross-team-linear-report\/no-cross-team-issues`\*\* — 3 failed/);
});
it('attaches per-scenario failures to the right file slug when names collide', () => {
// Two test cases each defining `happy-path`. Without the slug map,
// the renderer would conflate them — Albert's review flagged this
// exact bug. With the map, each row's collapsible carries only that
// row's failures.
const evalWithFailures = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'cross-team prompt',
scenarios: [
{
name: 'happy-path',
passCount: 0,
passes: [false, false, false],
reasoning: 'Linear node misconfigured',
failureCategory: 'builder_issue',
},
],
},
{
prompt: 'weather prompt',
scenarios: [
{
name: 'happy-path',
passCount: 0,
passes: [false, false, false],
reasoning: 'Weather mock returned empty',
failureCategory: 'mock_issue',
},
],
},
],
});
const pr = bucket('pr', [
s('cross-team-linear-report', 'happy-path', 0, 3),
s('weather-monitoring', 'happy-path', 0, 3),
]);
const base = bucket('master', [
s('cross-team-linear-report', 'happy-path', 10, 10),
s('weather-monitoring', 'happy-path', 10, 10),
]);
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)), {
slugByTestCase: slugMap(evalWithFailures, ['cross-team-linear-report', 'weather-monitoring']),
});
// Each per-scenario collapsible (under the regression table) must show
// ONLY its own failures. Slice each block at its closing </details>.
function collapsibleFor(slug: string): string {
const open = md.indexOf(`<code>${slug}</code>`);
expect(open).toBeGreaterThan(-1);
const close = md.indexOf('</details>', open);
return md.slice(open, close);
}
const crossTeamBlock = collapsibleFor('cross-team-linear-report/happy-path');
const weatherBlock = collapsibleFor('weather-monitoring/happy-path');
expect(crossTeamBlock).toMatch(/Linear node misconfigured/);
expect(crossTeamBlock).not.toMatch(/Weather mock returned empty/);
expect(weatherBlock).toMatch(/Weather mock returned empty/);
expect(weatherBlock).not.toMatch(/Linear node misconfigured/);
});
it('uses the slug instead of the prompt in the per-test-case table', () => {
const evalFx = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'Build a cross-team Linear report digest from open issues',
scenarios: [{ name: 'happy', passCount: 0, passes: [false, false, false] }],
},
],
});
const pr = bucket('pr', [s('cross-team-linear-report', 'happy', 0, 3)]);
const base = bucket('master', [s('cross-team-linear-report', 'happy', 10, 10)]);
const md = formatComparisonMarkdown(evalFx, ok(compareBuckets(pr, base)), {
slugByTestCase: slugMap(evalFx, ['cross-team-linear-report']),
});
// Per-test-case table cell should be the slug, not the prompt.
const perTcSection = md.slice(md.indexOf('Per-test-case results'));
expect(perTcSection).toMatch(/`cross-team-linear-report`/);
expect(perTcSection).not.toMatch(/Build a cross-team Linear report digest/);
});
it('skips per-scenario breakdown when slugByTestCase is omitted', () => {
// Without the slug map, the renderer can't disambiguate. We'd rather
// drop the breakdown than show a wrong one.
const evalWithFailures = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'a',
scenarios: [
{
name: 'happy',
passCount: 0,
passes: [false, false, false],
reasoning: 'Some failure',
failureCategory: 'builder_issue',
},
],
},
],
});
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
const base = bucket('master', [s('a', 'happy', 10, 10)]);
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)));
// Regression table still rendered.
expect(md).toMatch(/#### Regressions \(1\)/);
// But no per-scenario collapsible (which would have used <code>a/happy</code>
// with the breakdown summary text).
expect(md).not.toMatch(/3 of 3 failed · 3× builder_issue/);
});
it('renders the failure breakdown for non-notable categories with non-zero counts', () => {
// 50/100 vs 50/100 — no scenario regression, but still has builder_issue
// counts on both sides (non-notable but non-zero).
const pr: ExperimentBucket = {
experimentName: 'pr',
scenarios: new Map([['a/happy', { ...s('a', 'happy', 50, 100) }]]),
failureCategoryTotals: { builder_issue: 25 },
trialTotal: 100,
};
const base: ExperimentBucket = {
experimentName: 'master',
scenarios: new Map([['a/happy', { ...s('a', 'happy', 50, 100) }]]),
failureCategoryTotals: { builder_issue: 22 },
trialTotal: 100,
};
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/#### Failure breakdown/);
expect(md).toMatch(/`builder_issue`/);
// builder_issue isn't notable here, so no "notable" marker.
expect(md).not.toMatch(/builder_issue.*notable/);
});
});
describe('formatComparisonTerminal', () => {
const evalFixture = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'a',
scenarios: [{ name: 'happy', passCount: 0, passes: [false, false, false] }],
},
],
});
it('renders title, verdict, aggregate, and regression table without markdown syntax', () => {
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
const base = bucket('master-abc', [s('a', 'happy', 10, 10)]);
const out = formatComparisonTerminal(evalFixture, ok(compareBuckets(pr, base)));
expect(out).toMatch(/^Instance AI Workflow Eval/);
expect(out).toMatch(/▶ 1 regression/);
expect(out).toMatch(/PR\s{8}0\.0%/);
expect(out).toMatch(/baseline\s{2}100\.0%/);
expect(out).toMatch(/REGRESSIONS/);
expect(out).toMatch(/a\/happy/);
expect(out).not.toMatch(/^###/m);
expect(out).not.toMatch(/\| /);
});
it('renders LangSmith-disabled message when outcome is undefined', () => {
const out = formatComparisonTerminal(evalFixture);
expect(out).toMatch(/LangSmith disabled/);
expect(out).not.toMatch(/REGRESSIONS/);
});
it('shows partial banner when scenarios differ on each side', () => {
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10), s('b', 'happy', 5, 10)]);
const out = formatComparisonTerminal(evalFixture, ok(compareBuckets(pr, base)));
expect(out).toMatch(/partial: 1 baseline scenarios not run by PR/);
});
});

View File

@ -0,0 +1,161 @@
import {
classifyScenario,
fishersExactOneSidedLeft,
wilsonInterval,
} from '../comparison/statistics';
describe('fishersExactOneSidedLeft', () => {
it('returns 1 when either row is empty (no information)', () => {
expect(fishersExactOneSidedLeft(0, 0, 5, 5)).toBe(1);
expect(fishersExactOneSidedLeft(5, 5, 0, 0)).toBe(1);
});
it('returns 1 when no failures or no passes are observed (no test possible)', () => {
expect(fishersExactOneSidedLeft(3, 0, 5, 0)).toBe(1);
expect(fishersExactOneSidedLeft(0, 3, 0, 5)).toBe(1);
});
it('matches a known textbook case', () => {
// 2x2 table where PR (1/3) is much worse than baseline (10/10).
// Hypergeometric: P(X = 0) + P(X = 1) | drawn=3 from passes=11, fails=2
// = C(11,0)C(2,3)/C(13,3) + C(11,1)C(2,2)/C(13,3)
// = 0 + 11/286 ≈ 0.03846
const p = fishersExactOneSidedLeft(1, 2, 10, 0);
expect(p).toBeCloseTo(0.03846, 4);
});
it('returns p = 1 when PR pass rate equals baseline at maximum', () => {
// PR all pass, baseline all pass — under H0 the observed PR is the most likely outcome,
// so the left-tail (X ≤ a) p-value is exactly 1.
const p = fishersExactOneSidedLeft(5, 0, 5, 0);
expect(p).toBe(1);
});
it('detects a strong regression with high N', () => {
// PR 0/10, baseline 10/10 — extremely strong evidence PR is worse.
const p = fishersExactOneSidedLeft(0, 10, 10, 0);
expect(p).toBeLessThan(0.001);
});
it('returns a p-value above 0.5 when PR matches baseline rates exactly', () => {
// PR 5/10, baseline 5/10 — by symmetry the left tail is 0.5 plus half the mass at
// the observed value, so comfortably above 0.5 (we're at the center of the distribution).
const p = fishersExactOneSidedLeft(5, 5, 5, 5);
expect(p).toBeGreaterThan(0.5);
});
});
describe('wilsonInterval', () => {
it('returns [0, 1] for total=0', () => {
expect(wilsonInterval(0, 0)).toEqual({ lower: 0, upper: 1 });
});
it('produces reasonable bounds for 5/10', () => {
const ci = wilsonInterval(5, 10);
// Known Wilson 95% CI for 5/10: roughly [0.237, 0.763]
expect(ci.lower).toBeCloseTo(0.237, 2);
expect(ci.upper).toBeCloseTo(0.763, 2);
});
it('produces tight bounds for 0/100', () => {
const ci = wilsonInterval(0, 100);
expect(ci.lower).toBe(0);
expect(ci.upper).toBeLessThan(0.05);
});
it('produces tight bounds for 100/100', () => {
const ci = wilsonInterval(100, 100);
// upper analytically equals 1 but lands slightly under it after FP rounding —
// any reasonable CI for 100/100 should still be tight to the top of the range.
expect(ci.upper).toBeGreaterThanOrEqual(0.99);
expect(ci.lower).toBeGreaterThan(0.95);
});
it('throws when passes > total', () => {
expect(() => wilsonInterval(5, 3)).toThrow();
});
});
describe('classifyScenario', () => {
it('flags a clear regression on a reliable scenario as hard_regression', () => {
const result = classifyScenario(0, 10, 10, 10);
expect(result.verdict).toBe('hard_regression');
expect(result.delta).toBe(-1);
});
it('marks a hard-significant drop on an unreliable baseline as unreliable_baseline', () => {
// Baseline 4/10 (40%) — below hard reliable (70%). PR 0/10 is a 40pp drop with
// Fisher p < 0.05. We surface it as `unreliable_baseline` rather than flagging.
const result = classifyScenario(0, 10, 4, 10);
expect(result.verdict).toBe('unreliable_baseline');
});
it('reports stable when the drop is sub-MDE on a flaky baseline', () => {
// Baseline 1/10 (flaky), PR 0/10 — only a 10pp drop, below MDE.
const result = classifyScenario(0, 10, 1, 10);
expect(result.verdict).toBe('stable');
});
it('does not flag a small drop below the soft MDE threshold', () => {
// 9/10 vs 10/10 = 10pp drop, below soft MDE (15pp).
const result = classifyScenario(9, 10, 10, 10);
expect(result.verdict).toBe('stable');
});
it('flags an improvement when PR is significantly better', () => {
const result = classifyScenario(10, 10, 0, 10);
expect(result.verdict).toBe('improvement');
});
it('flags improvement even on a never-passing baseline', () => {
// "Never passes" baseline (0/10) — fix is worth surfacing without the reliability gate.
const result = classifyScenario(8, 10, 0, 10);
expect(result.verdict).toBe('improvement');
});
it('returns insufficient_data when either side has no trials', () => {
expect(classifyScenario(0, 0, 5, 10).verdict).toBe('insufficient_data');
expect(classifyScenario(5, 10, 0, 0).verdict).toBe('insufficient_data');
});
it('flags the most extreme outcome at minimum N as hard_regression', () => {
// PR 0/3 vs baseline 3/3 — Fisher one-sided p ≈ 0.05, delta = -100pp.
const result = classifyScenario(0, 3, 3, 3);
expect(result.verdict).toBe('hard_regression');
});
it('reports stable when N is small enough that even a full flip is sub-significant for soft tier', () => {
// PR 1/2 vs baseline 2/2 — delta -50pp but Fisher p ≈ 0.5 (way above soft α=0.20).
// Soft MDE met, but significance fails on both tiers.
const result = classifyScenario(1, 2, 2, 2);
expect(['stable', 'watch']).toContain(result.verdict);
});
it('marks soft regression when hard delta is missed but soft thresholds met', () => {
// 6/10 vs 10/10 = 40pp drop, p ≈ 0.043, baseline 100% reliable.
// Hard defaults would flag this; force a stricter hard delta to push it to soft.
const result = classifyScenario(6, 10, 10, 10, {
hard: { maxPValue: 0.05, minDelta: 0.5, minBaselinePassRate: 0.7 },
soft: { maxPValue: 0.2, minDelta: 0.15, minBaselinePassRate: 0.5 },
});
expect(result.verdict).toBe('soft_regression');
});
it('marks watch when delta crosses the watch threshold without significance', () => {
// 5/10 vs 7/10 = -20pp drop, p ≈ 0.32 — not significant for hard or soft.
// Default watchDelta is 0.35, so this should not be `watch`. Force it via
// a smaller threshold to validate the path.
const result = classifyScenario(5, 10, 7, 10, { watchDelta: 0.15 });
expect(result.verdict).toBe('watch');
});
it('respects custom hard-tier delta override', () => {
// 7/10 vs 10/10 = 30pp delta. Default hard minDelta is 0.3, so this barely qualifies.
// With hard.minDelta 0.4, it drops into `soft_regression` (still passes soft 0.15 minDelta).
// p ≈ 0.105 < soft maxPValue (0.2), so soft fires.
const result = classifyScenario(7, 10, 10, 10, {
hard: { minDelta: 0.4 },
});
expect(result.verdict).toBe('soft_regression');
});
});

View File

@ -45,7 +45,7 @@ export interface CliArgs {
// ---------------------------------------------------------------------------
const cliArgsSchema = z.object({
timeoutMs: z.number().int().positive().default(600_000),
timeoutMs: z.number().int().positive().default(900_000),
baseUrls: z.array(z.string().url()).min(1).default(['http://localhost:5678']),
email: z.string().optional(),
password: z.string().optional(),
@ -104,7 +104,7 @@ interface RawArgs {
function parseRawArgs(argv: string[]): RawArgs {
const result: RawArgs = {
timeoutMs: 600_000,
timeoutMs: 900_000,
baseUrls: ['http://localhost:5678'],
verbose: false,
keepWorkflows: false,

View File

@ -23,6 +23,15 @@ import { buildCIMetadata, computeExperimentPrefix } from './ci-metadata';
import { LaneAllocator } from './lane-allocator';
import { expandWithIterations, partitionRoundRobin } from './lanes';
import { N8nClient } from '../clients/n8n-client';
import {
compareBuckets,
type ComparisonOutcome,
type ComparisonResult,
type ExperimentBucket,
type ScenarioCounts,
} from '../comparison/compare';
import { fetchBaselineBucket, findLatestBaseline } from '../comparison/fetch-baseline';
import { formatComparisonMarkdown, formatComparisonTerminal } from '../comparison/format';
import { seedCredentials, cleanupCredentials } from '../credentials/seeder';
import { loadWorkflowTestCasesWithFiles } from '../data/workflows';
import type { WorkflowTestCaseWithFile } from '../data/workflows';
@ -43,6 +52,7 @@ import type {
MultiRunEvaluation,
ScenarioResult,
TestScenario,
WorkflowTestCase,
WorkflowTestCaseResult,
} from '../types';
@ -160,21 +170,40 @@ async function main(): Promise<void> {
const hasLangSmith = Boolean(process.env.LANGSMITH_API_KEY);
let evaluation: MultiRunEvaluation;
let experimentName: string | undefined;
let outcome: ComparisonOutcome | undefined;
let slugByTestCase: Map<WorkflowTestCase, string> | undefined;
if (hasLangSmith) {
logger.info('LangSmith API key detected, using evaluate() with experiment tracking');
evaluation = await runWithLangSmith({ args, lanes, logger });
const langsmithRun = await runWithLangSmith({ args, lanes, logger });
evaluation = langsmithRun.evaluation;
experimentName = langsmithRun.experimentName;
outcome = langsmithRun.outcome;
slugByTestCase = langsmithRun.slugByTestCase;
} else {
logger.info('No LANGSMITH_API_KEY, running direct loop (results in eval-results.json only)');
evaluation = await runDirectLoop({ args, lanes, logger });
}
const totalDuration = Date.now() - startTime;
const outputPath = writeEvalResults(evaluation, totalDuration, args.outputDir);
console.log(`Results: ${outputPath}`);
const commitSha = process.env.LANGSMITH_REVISION_ID ?? process.env.GITHUB_SHA;
const { jsonPath, prCommentPath } = writeEvalResults(
evaluation,
totalDuration,
args.outputDir,
experimentName,
outcome,
commitSha,
slugByTestCase,
);
console.log(`Results: ${jsonPath}`);
console.log(`PR comment: ${prCommentPath}`);
const htmlPath = writeWorkflowReport(flattenRunsForReport(evaluation));
console.log(`Report: ${htmlPath}`);
printSummary(evaluation);
console.log(`Report: ${htmlPath}`);
console.log(
'\n' + formatComparisonTerminal(evaluation, outcome, { commitSha, slugByTestCase }),
);
} finally {
await Promise.all(
lanes.map(async (lane) => {
@ -188,7 +217,12 @@ async function main(): Promise<void> {
// LangSmith mode: evaluate() with dataset sync, tracing, experiments
// ---------------------------------------------------------------------------
async function runWithLangSmith(config: RunConfig): Promise<MultiRunEvaluation> {
async function runWithLangSmith(config: RunConfig): Promise<{
evaluation: MultiRunEvaluation;
experimentName: string;
outcome: ComparisonOutcome;
slugByTestCase: Map<WorkflowTestCase, string>;
}> {
const { args, lanes, logger } = config;
const lsClient = new Client();
@ -466,7 +500,24 @@ async function runWithLangSmith(config: RunConfig): Promise<MultiRunEvaluation>
logger,
});
return evaluation;
const outcome = await tryRunComparison({
lsClient,
prExperimentName: experimentResults.experimentName,
evaluation,
testCasesWithFiles,
logger,
});
const slugByTestCase = new Map<WorkflowTestCase, string>(
testCasesWithFiles.map(({ testCase, fileSlug }) => [testCase, fileSlug]),
);
return {
evaluation,
experimentName: experimentResults.experimentName,
outcome,
slugByTestCase,
};
} finally {
if (!args.keepWorkflows) {
await Promise.all(
@ -826,15 +877,22 @@ function computePassRatePerIter(evaluation: MultiRunEvaluation): string {
function writeEvalResults(
evaluation: MultiRunEvaluation,
duration: number,
outputDir?: string,
): string {
outputDir: string | undefined,
experimentName: string | undefined,
outcome: ComparisonOutcome | undefined,
commitSha: string | undefined,
slugByTestCase: Map<WorkflowTestCase, string> | undefined,
): { jsonPath: string; prCommentPath: string } {
const { totalRuns, testCases } = evaluation;
const metrics = computeAggregateMetrics(evaluation);
const result = outcome?.kind === 'ok' ? outcome.result : undefined;
const report = {
timestamp: new Date().toISOString(),
duration,
totalRuns,
experimentName,
summary: {
testCases: testCases.length,
built: metrics.built,
@ -843,6 +901,19 @@ function writeEvalResults(
passHatK: metrics.passHatK,
passRatePerIter: metrics.passRatePerIter,
},
// Structured comparison payload only — the rendered markdown lives in
// the sibling `eval-pr-comment.md` file so consumers can pick the format
// they want without re-running the eval. `comparisonStatus` records why
// the comparison was skipped when applicable, so JSON consumers can
// distinguish "no baseline yet" from "regression detection broke".
comparison: result
? {
baseline: result.baseline.experimentName,
result: serializeComparison(result),
}
: undefined,
comparisonStatus: outcome?.kind ?? 'not_attempted',
comparisonError: outcome?.kind === 'fetch_failed' ? outcome.error : undefined,
testCases: testCases.map((tc) => ({
name: tc.testCase.prompt.slice(0, 70),
buildSuccessCount: tc.buildSuccessCount,
@ -868,74 +939,137 @@ function writeEvalResults(
const targetDir = outputDir ?? process.cwd();
mkdirSync(targetDir, { recursive: true });
const outputPath = join(targetDir, 'eval-results.json');
writeFileSync(outputPath, JSON.stringify(report, null, 2));
return outputPath;
const jsonPath = join(targetDir, 'eval-results.json');
writeFileSync(jsonPath, JSON.stringify(report, null, 2));
// Always write the rendered PR comment — the markdown formatter handles
// both with-comparison and no-baseline cases. CI consumes this file
// directly; local users get a copy-pasteable artifact.
const prCommentPath = join(targetDir, 'eval-pr-comment.md');
writeFileSync(
prCommentPath,
formatComparisonMarkdown(evaluation, outcome, { commitSha, slugByTestCase }),
);
return { jsonPath, prCommentPath };
}
/**
* Convert ComparisonResult into a JSON-serializable shape (Maps don't survive
* JSON.stringify by default).
*/
function serializeComparison(result: ComparisonResult): {
pr: { experimentName: string };
baseline: { experimentName: string };
aggregate: ComparisonResult['aggregate'];
scenarios: ComparisonResult['scenarios'];
prOnly: ComparisonResult['prOnly'];
baselineOnly: ComparisonResult['baselineOnly'];
failureCategories: ComparisonResult['failureCategories'];
} {
return {
pr: result.pr,
baseline: result.baseline,
aggregate: result.aggregate,
scenarios: result.scenarios,
prOnly: result.prOnly,
baselineOnly: result.baselineOnly,
failureCategories: result.failureCategories,
};
}
// ---------------------------------------------------------------------------
// Console summary
// Comparison vs the pinned baseline experiment
// ---------------------------------------------------------------------------
function printSummary(evaluation: MultiRunEvaluation): void {
const { totalRuns, testCases } = evaluation;
const multiRun = totalRuns > 1;
const metrics = computeAggregateMetrics(evaluation);
/**
* Best-effort comparison. Returns a tagged outcome so the PR comment can
* distinguish "no baseline yet" / "this run IS the baseline" from a real
* regression-detection outage (LangSmith down, fetch failure). Never throws;
* the eval run is not gated on the comparison.
*/
async function tryRunComparison(config: {
lsClient: Client;
prExperimentName: string;
evaluation: MultiRunEvaluation;
testCasesWithFiles: WorkflowTestCaseWithFile[];
logger: EvalLogger;
}): Promise<ComparisonOutcome> {
const { lsClient, prExperimentName, evaluation, testCasesWithFiles, logger } = config;
console.log('\n=== Workflow Eval Results ===\n');
for (const tc of testCases) {
console.log(`${tc.testCase.prompt.slice(0, 70)}...`);
if (multiRun) {
console.log(` Build: ${String(tc.buildSuccessCount)}/${String(totalRuns)} runs`);
} else {
const r = tc.runs[0];
const buildStatus = r.workflowBuildSuccess ? 'BUILT' : 'BUILD FAILED';
console.log(` Workflow: ${buildStatus}${r.workflowId ? ` (${r.workflowId})` : ''}`);
if (r.buildError) {
console.log(` Error: ${r.buildError.slice(0, 200)}`);
}
try {
const baselineName = await findLatestBaseline(lsClient);
if (!baselineName) {
logger.verbose(
'No baseline experiment found — skipping comparison. ' +
'Run with --experiment-name instance-ai-baseline to create one.',
);
return { kind: 'no_baseline' };
}
if (baselineName === prExperimentName) {
logger.verbose('Current run is the baseline — skipping comparison.');
return { kind: 'self_baseline', experimentName: baselineName };
}
logger.info(`Comparing against baseline: ${baselineName}`);
const baseline = await fetchBaselineBucket(lsClient, baselineName);
const pr = bucketFromEvaluation(evaluation, testCasesWithFiles, prExperimentName);
return { kind: 'ok', result: compareBuckets(pr, baseline) };
} catch (error: unknown) {
const msg = error instanceof Error ? error.message : String(error);
logger.warn(`Comparison vs baseline failed: ${msg}`);
return { kind: 'fetch_failed', error: msg };
}
}
/**
* Project the in-memory MultiRunEvaluation onto the bucket shape used by
* fetchBaselineBucket, keyed by `${fileSlug}/${scenarioName}`.
*
* Looks up `fileSlug` by test case reference rather than array index: the
* comparison key depends on getting the right slug, and zipping by index
* silently miscompares if anything ever reorders the aggregate.
*/
function bucketFromEvaluation(
evaluation: MultiRunEvaluation,
testCasesWithFiles: WorkflowTestCaseWithFile[],
experimentName: string,
): ExperimentBucket {
const slugByTestCase = new Map(
testCasesWithFiles.map(({ testCase, fileSlug }) => [testCase, fileSlug]),
);
const scenarios = new Map<string, ScenarioCounts>();
const failureCategoryTotals: Record<string, number> = {};
let trialTotal = 0;
for (const tc of evaluation.testCases) {
const fileSlug = slugByTestCase.get(tc.testCase);
if (!fileSlug) {
throw new Error(
`bucketFromEvaluation: no fileSlug for test case "${tc.testCase.prompt.slice(0, 60)}"`,
);
}
const total = tc.runs.length;
for (const sa of tc.scenarios) {
if (multiRun) {
const passAtK = Math.round((sa.passAtK[metrics.kIndex] ?? 0) * 100);
const passHatK = Math.round((sa.passHatK[metrics.kIndex] ?? 0) * 100);
console.log(
` ${sa.scenario.name}: ${String(sa.passCount)}/${String(totalRuns)} passed` +
` | pass@${String(totalRuns)}: ${String(passAtK)}% | pass^${String(totalRuns)}: ${String(passHatK)}%`,
);
} else {
const sr = sa.runs[0];
const icon = sr.success ? '✓' : '✗';
const category = sr.failureCategory ? ` [${sr.failureCategory}]` : '';
console.log(
` ${icon} ${sr.scenario.name}: ${sr.success ? 'PASS' : 'FAIL'}${category} (${String(sr.score * 100)}%)`,
);
if (!sr.success) {
const execErrors = sr.evalResult?.errors ?? [];
if (execErrors.length > 0) {
console.log(` Error: ${execErrors.join('; ').slice(0, 200)}`);
}
console.log(` Diagnosis: ${sr.reasoning.slice(0, 200)}`);
const key = `${fileSlug}/${sa.scenario.name}`;
const failureCategories: Record<string, number> = {};
for (const sr of sa.runs) {
trialTotal++;
if (!sr.success && sr.failureCategory) {
failureCategories[sr.failureCategory] = (failureCategories[sr.failureCategory] ?? 0) + 1;
failureCategoryTotals[sr.failureCategory] =
(failureCategoryTotals[sr.failureCategory] ?? 0) + 1;
}
}
scenarios.set(key, {
testCaseFile: fileSlug,
scenarioName: sa.scenario.name,
passed: sa.passCount,
total,
failureCategories,
});
}
console.log('');
}
if (multiRun) {
console.log(
`${String(metrics.built)}/${String(testCases.length)} built | pass@${String(totalRuns)}: ${String(Math.round(metrics.passAtK * 100))}% | pass^${String(totalRuns)}: ${String(Math.round(metrics.passHatK * 100))}% | iterations: ${metrics.passRatePerIter}`,
);
} else {
const allScenarios = testCases.flatMap((tc) => tc.scenarios);
const passed = allScenarios.filter((s) => s.runs[0]?.success).length;
const total = metrics.scenariosTotal;
console.log(
`${String(metrics.built)}/${String(testCases.length)} built | ${String(passed)}/${String(total)} passed (${String(total > 0 ? Math.round((passed / total) * 100) : 0)}%)`,
);
}
return { experimentName, scenarios, failureCategoryTotals, trialTotal };
}
main().catch((error) => {

View File

@ -0,0 +1,333 @@
// ---------------------------------------------------------------------------
// Comparison core: take two experiment buckets, return a ComparisonResult.
//
// Pure function, no I/O. The tier thresholds (p-value cutoff, minimum delta,
// minimum baseline pass rate) live in statistics.ts — there's no CLI knob.
// Tune them there if the false-positive rate drifts.
// ---------------------------------------------------------------------------
import {
classifyScenario,
wilsonInterval,
type ClassifyOptions,
type ScenarioClassification,
type ScenarioVerdict,
} from './statistics';
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
export interface ScenarioCounts {
testCaseFile: string;
scenarioName: string;
passed: number;
total: number;
failureCategories?: Record<string, number>;
}
export interface ExperimentBucket {
experimentName: string;
scenarios: Map<string, ScenarioCounts>;
/**
* Aggregated failure-category counts across all trials in all scenarios.
* Used for the run-level failure-category drift table, orthogonal to
* per-scenario verdicts.
*/
failureCategoryTotals?: Record<string, number>;
trialTotal?: number;
}
export interface ScenarioComparison extends ScenarioClassification {
testCaseFile: string;
scenarioName: string;
prPasses: number;
prTotal: number;
baselinePasses: number;
baselineTotal: number;
}
export interface AggregateComparison {
intersectionSize: number;
prAggregatePassRate: number;
baselineAggregatePassRate: number;
prAggregateCI: { lower: number; upper: number };
baselineAggregateCI: { lower: number; upper: number };
delta: number;
}
export interface FailureCategoryComparison {
category: string;
prCount: number;
prRate: number; // count / trialTotal
baselineCount: number;
baselineRate: number;
delta: number; // prRate - baselineRate
notable: boolean;
}
export interface ComparisonResult {
pr: { experimentName: string };
baseline: { experimentName: string };
aggregate: AggregateComparison;
scenarios: ScenarioComparison[];
prOnly: Array<{ testCaseFile: string; scenarioName: string }>;
baselineOnly: Array<{ testCaseFile: string; scenarioName: string }>;
failureCategories: FailureCategoryComparison[];
}
/**
* Result of a comparison attempt. The `kind` field distinguishes between
* "ran successfully", "skipped intentionally" (no baseline yet, current run
* IS the baseline), and "failed unexpectedly" (LangSmith API error, fetch
* timeout, etc.). The PR comment renders a different alert per kind so
* readers can tell a missing baseline from a regression-detection outage.
*/
export type ComparisonOutcome =
| { kind: 'ok'; result: ComparisonResult }
| { kind: 'no_baseline' }
| { kind: 'self_baseline'; experimentName: string }
| { kind: 'fetch_failed'; error: string };
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/** Hard regressions only — high-confidence, gating-grade flags. */
export function hardRegressions(result: ComparisonResult): ScenarioComparison[] {
return result.scenarios.filter((s) => s.verdict === 'hard_regression');
}
/** Soft regressions — looser thresholds, worth investigating but not gating. */
export function softRegressions(result: ComparisonResult): ScenarioComparison[] {
return result.scenarios.filter((s) => s.verdict === 'soft_regression');
}
/** Movement ≥ watchDelta without reaching a flag tier. Visibility only. */
export function watchList(result: ComparisonResult): ScenarioComparison[] {
return result.scenarios.filter((s) => s.verdict === 'watch');
}
export function improvements(result: ComparisonResult): ScenarioComparison[] {
return result.scenarios.filter((s) => s.verdict === 'improvement');
}
export function byVerdict(result: ComparisonResult): Record<ScenarioVerdict, number> {
const counts: Record<ScenarioVerdict, number> = {
hard_regression: 0,
soft_regression: 0,
watch: 0,
improvement: 0,
stable: 0,
unreliable_baseline: 0,
insufficient_data: 0,
};
for (const s of result.scenarios) counts[s.verdict]++;
return counts;
}
// ---------------------------------------------------------------------------
// Compare
// ---------------------------------------------------------------------------
/**
* Compare two experiment buckets and produce a structured comparison result.
*
* Aggregate is computed over the *intersection* of scenarios, the only
* scenarios for which the rates are directly comparable. PR-only and
* baseline-only scenarios are surfaced separately, not folded into the
* aggregate.
*
* Aggregate pass rate is the *micro* average: total passes / total trials
* across the intersection.
*
* `options` exists for tests; production callers pass nothing.
*/
export function compareBuckets(
pr: ExperimentBucket,
baseline: ExperimentBucket,
options: ClassifyOptions = {},
): ComparisonResult {
const scenarios: ScenarioComparison[] = [];
const prOnly: Array<{ testCaseFile: string; scenarioName: string }> = [];
const baselineOnly: Array<{ testCaseFile: string; scenarioName: string }> = [];
let prIPasses = 0;
let prITotal = 0;
let baseIPasses = 0;
let baseITotal = 0;
for (const [key, prCounts] of pr.scenarios) {
const baseCounts = baseline.scenarios.get(key);
if (!baseCounts) {
prOnly.push({
testCaseFile: prCounts.testCaseFile,
scenarioName: prCounts.scenarioName,
});
continue;
}
prIPasses += prCounts.passed;
prITotal += prCounts.total;
baseIPasses += baseCounts.passed;
baseITotal += baseCounts.total;
const classification = classifyScenario(
prCounts.passed,
prCounts.total,
baseCounts.passed,
baseCounts.total,
options,
);
scenarios.push({
testCaseFile: prCounts.testCaseFile,
scenarioName: prCounts.scenarioName,
prPasses: prCounts.passed,
prTotal: prCounts.total,
baselinePasses: baseCounts.passed,
baselineTotal: baseCounts.total,
...classification,
});
}
for (const [key, baseCounts] of baseline.scenarios) {
if (!pr.scenarios.has(key)) {
baselineOnly.push({
testCaseFile: baseCounts.testCaseFile,
scenarioName: baseCounts.scenarioName,
});
}
}
const aggregate: AggregateComparison = {
intersectionSize: scenarios.length,
prAggregatePassRate: rate(prIPasses, prITotal),
baselineAggregatePassRate: rate(baseIPasses, baseITotal),
prAggregateCI: wilsonInterval(prIPasses, prITotal),
baselineAggregateCI: wilsonInterval(baseIPasses, baseITotal),
delta: rate(prIPasses, prITotal) - rate(baseIPasses, baseITotal),
};
scenarios.sort(scenarioComparator);
const failureCategories = compareFailureCategories(pr, baseline);
return {
pr: { experimentName: pr.experimentName },
baseline: { experimentName: baseline.experimentName },
aggregate,
scenarios,
prOnly,
baselineOnly,
failureCategories,
};
}
// ---------------------------------------------------------------------------
// Failure-category drift
// ---------------------------------------------------------------------------
/** Min absolute rate gap to consider a category notable (5 percentage points). */
const CATEGORY_NOTABLE_RATE_DELTA = 0.05;
/** Min absolute trial-count gap (over scaling) required alongside the rate gap. */
const CATEGORY_NOTABLE_COUNT_DELTA = 3;
/**
* Categories the verifier is supposed to emit. Anything else (malformed
* strings like `-`, `>builder_issue`, empty, etc.) is dropped from the
* comparison so the PR comment doesn't display verifier noise. Keep in sync
* with the verifier's category enum; unknown values are logged via
* console.warn (see compareFailureCategories).
*/
const KNOWN_FAILURE_CATEGORIES = new Set([
'builder_issue',
'mock_issue',
'framework_issue',
'verification_failure',
'build_failure',
]);
function isCategoryNotable(
prCount: number,
prTotal: number,
baselineCount: number,
baselineTotal: number,
): boolean {
const rateGap = Math.abs(prCount / prTotal - baselineCount / baselineTotal);
if (rateGap < CATEGORY_NOTABLE_RATE_DELTA) return false;
const expectedPrCount = baselineCount * (prTotal / baselineTotal);
const countGap = Math.abs(prCount - expectedPrCount);
return countGap >= CATEGORY_NOTABLE_COUNT_DELTA;
}
function compareFailureCategories(
pr: ExperimentBucket,
baseline: ExperimentBucket,
): FailureCategoryComparison[] {
if (!pr.failureCategoryTotals || !baseline.failureCategoryTotals) return [];
const prTotal = pr.trialTotal ?? 0;
const baseTotal = baseline.trialTotal ?? 0;
if (prTotal === 0 || baseTotal === 0) return [];
// Surface unrecognised values so we notice when the verifier adds a new
// category (or starts emitting noise we should clean up). Unknown values
// don't enter the comparison output; the emit loop below only produces rows
// for KNOWN_FAILURE_CATEGORIES.
for (const category of Object.keys(pr.failureCategoryTotals)) {
if (!KNOWN_FAILURE_CATEGORIES.has(category)) {
console.warn(`[comparison] dropping unknown failureCategory "${category}"`);
}
}
for (const category of Object.keys(baseline.failureCategoryTotals)) {
if (!KNOWN_FAILURE_CATEGORIES.has(category)) {
console.warn(`[comparison] dropping unknown failureCategory "${category}"`);
}
}
// Always emit a row for every known category, even if both sides are 0.
// The renderer can decide whether to suppress 0/0 rows; this gives readers
// a complete picture of the failure-type taxonomy by default.
const out: FailureCategoryComparison[] = [];
for (const category of KNOWN_FAILURE_CATEGORIES) {
const prCount = pr.failureCategoryTotals[category] ?? 0;
const baselineCount = baseline.failureCategoryTotals[category] ?? 0;
out.push({
category,
prCount,
prRate: prCount / prTotal,
baselineCount,
baselineRate: baselineCount / baseTotal,
delta: prCount / prTotal - baselineCount / baseTotal,
notable: isCategoryNotable(prCount, prTotal, baselineCount, baseTotal),
});
}
// Sort: notable first, then by absolute delta descending.
out.sort((a, b) => {
if (a.notable !== b.notable) return a.notable ? -1 : 1;
return Math.abs(b.delta) - Math.abs(a.delta);
});
return out;
}
function rate(passes: number, total: number): number {
return total > 0 ? passes / total : 0;
}
const VERDICT_ORDER: Record<ScenarioComparison['verdict'], number> = {
hard_regression: 0,
soft_regression: 1,
improvement: 2,
watch: 3,
unreliable_baseline: 4,
stable: 5,
insufficient_data: 6,
};
function scenarioComparator(a: ScenarioComparison, b: ScenarioComparison): number {
const av = VERDICT_ORDER[a.verdict];
const bv = VERDICT_ORDER[b.verdict];
if (av !== bv) return av - bv;
const fileCmp = a.testCaseFile.localeCompare(b.testCaseFile);
if (fileCmp !== 0) return fileCmp;
return a.scenarioName.localeCompare(b.scenarioName);
}

View File

@ -0,0 +1,123 @@
// ---------------------------------------------------------------------------
// Find and fetch the pinned baseline experiment from LangSmith.
//
// The baseline is whichever experiment most recently used the
// `instance-ai-baseline` prefix. To refresh, run the eval with that prefix:
//
// pnpm eval:instance-ai --experiment-name instance-ai-baseline --iterations 10
//
// LangSmith appends a random suffix, so successive baseline runs become
// `instance-ai-baseline-7abc1234`, `instance-ai-baseline-9def5678`, etc.
// We pick the most recently started one.
//
// Two functions, both small:
//
// findLatestBaseline — list baseline-prefixed projects, pick newest.
// fetchBaselineBucket — read its root runs, bucket per scenario.
//
// Both throw on transport errors. Callers are expected to swallow with a log:
// the comparison is advisory and shouldn't fail the eval run.
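//
// Illustrative caller sketch (variable names are assumptions, not the real
// CLI wiring):
//
//   let baseline: ExperimentBucket | undefined;
//   try {
//     const name = await findLatestBaseline(client);
//     if (name) baseline = await fetchBaselineBucket(client, name);
//   } catch (error) {
//     console.warn(`Skipping baseline comparison: ${String(error)}`);
//   }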
// ---------------------------------------------------------------------------
import type { Client } from 'langsmith';
import { z } from 'zod';
import type { ExperimentBucket, ScenarioCounts } from './compare';
/**
* Prefix the latest-baseline lookup matches against. The CLI flag
* `--experiment-name instance-ai-baseline` produces project names like
* `instance-ai-baseline-7abc1234` (LangSmith appends a hyphen + suffix), so
* the constant must end in `-` so the lookup doesn't match unrelated
* experiments whose names merely start with `instance-ai-baseline`.
*/
export const BASELINE_EXPERIMENT_PREFIX = 'instance-ai-baseline-';
const inputsSchema = z
.object({
testCaseFile: z.string().default(''),
scenarioName: z.string().default(''),
})
.passthrough();
const outputsSchema = z
.object({
passed: z.boolean().default(false),
failureCategory: z.string().optional(),
})
.passthrough();
/**
* Return the most recently created baseline experiment, or `undefined` if
* none exist. We pick by `start_time` so a re-run of an older snapshot
* doesn't displace the latest one.
*/
export async function findLatestBaseline(client: Client): Promise<string | undefined> {
let latest: { name: string; ts: number } | undefined;
for await (const project of client.listProjects({ nameContains: BASELINE_EXPERIMENT_PREFIX })) {
const name = project.name;
if (!name?.startsWith(BASELINE_EXPERIMENT_PREFIX)) continue;
const ts = project.start_time ? new Date(project.start_time).getTime() : 0;
if (!latest || ts > latest.ts) latest = { name, ts };
}
return latest?.name;
}
/**
* Fetch a baseline experiment's per-scenario pass/fail counts. Each root run
* corresponds to one (testCaseFile, scenarioName, iteration) triple; we
* bucket by `${testCaseFile}/${scenarioName}` and accumulate.
*
* Throws if the project does not exist.
*/
export async function fetchBaselineBucket(
client: Client,
experimentName: string,
): Promise<ExperimentBucket> {
const project = await client.readProject({ projectName: experimentName });
const scenarios = new Map<string, ScenarioCounts>();
const failureCategoryTotals: Record<string, number> = {};
let trialTotal = 0;
for await (const run of client.listRuns({ projectId: project.id, isRoot: true })) {
const inputs = inputsSchema.safeParse(run.inputs ?? {});
if (!inputs.success || !inputs.data.testCaseFile || !inputs.data.scenarioName) continue;
// Skip runs that never produced outputs (still running, crashed before
// completion, infra error). Without this guard, the schema defaults
// (passed → false) would coerce them into "failed" trials and inflate
// the baseline failure count. Mirrors `parseTargetOutput` in cli/index.ts.
const rawOutputs = run.outputs;
if (
rawOutputs === null ||
rawOutputs === undefined ||
typeof rawOutputs !== 'object' ||
Object.keys(rawOutputs).length === 0
) {
continue;
}
const outputs = outputsSchema.safeParse(rawOutputs);
if (!outputs.success) continue;
const key = `${inputs.data.testCaseFile}/${inputs.data.scenarioName}`;
const existing: ScenarioCounts = scenarios.get(key) ?? {
testCaseFile: inputs.data.testCaseFile,
scenarioName: inputs.data.scenarioName,
passed: 0,
total: 0,
failureCategories: {},
};
existing.total++;
trialTotal++;
if (outputs.data.passed) {
existing.passed++;
} else if (outputs.data.failureCategory) {
const cat = outputs.data.failureCategory;
existing.failureCategories = existing.failureCategories ?? {};
existing.failureCategories[cat] = (existing.failureCategories[cat] ?? 0) + 1;
failureCategoryTotals[cat] = (failureCategoryTotals[cat] ?? 0) + 1;
}
scenarios.set(key, existing);
}
return { experimentName, scenarios, failureCategoryTotals, trialTotal };
}
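// Hypothetical shape of the returned bucket (values invented for illustration):
//
//   {
//     experimentName: 'instance-ai-baseline-7abc1234',
//     trialTotal: 60,
//     failureCategoryTotals: { builder_issue: 4, mock_issue: 1 },
//     scenarios: Map {
//       'some-workflow/happy-path' => {
//         testCaseFile: 'some-workflow', scenarioName: 'happy-path',
//         passed: 9, total: 10, failureCategories: { builder_issue: 1 },
//       },
//       // …one entry per (testCaseFile, scenarioName) pair
//     },
//   }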

View File

@ -0,0 +1,961 @@
// ---------------------------------------------------------------------------
// Render the eval run as a PR comment (markdown) or a console summary
// (aligned plain text). Both formats are driven by:
//
// - MultiRunEvaluation — pass rates, build counts, per-trial reasoning
// - ComparisonOutcome (optional) — tagged result of the baseline
// comparison: `ok` (ran, has scenarios), `no_baseline` (skipped), or
// `fetch_failed` / `self_baseline` (skipped for cause). Each kind
// drives a distinct top-of-comment alert so a LangSmith outage doesn't
// get dressed up as "no baseline configured".
//
// When no comparison is available (no baseline yet, LangSmith offline)
// the renderers still produce a useful per-test-case summary. When a
// comparison is available, sections render in priority order:
// regressions, soft regressions, notable movement, improvements,
// failure-category drift. Only sections with content are emitted.
// ---------------------------------------------------------------------------
import {
hardRegressions,
improvements,
softRegressions,
watchList,
type ComparisonOutcome,
type ComparisonResult,
type FailureCategoryComparison,
type ScenarioComparison,
} from './compare';
import type {
MultiRunEvaluation,
TestCaseAggregation,
WorkflowTestCase,
WorkflowTestCaseResult,
} from '../types';
interface FormatOptions {
/** Optional commit SHA to include in the heading. Truncated to 8 chars. */
commitSha?: string;
/** Maps each test-case reference to its file slug. When provided, the
* per-scenario failure breakdown looks up failed runs by
* `${fileSlug}/${scenarioName}`, which is deterministic across collisions like
* multiple `happy-path` scenarios. When omitted, the breakdown is
* skipped (no name-only fallback; that lookup was wrong on real data). */
slugByTestCase?: Map<WorkflowTestCase, string>;
}
// ---------------------------------------------------------------------------
// Markdown PR comment
// ---------------------------------------------------------------------------
export function formatComparisonMarkdown(
evaluation: MultiRunEvaluation,
outcome?: ComparisonOutcome,
options: FormatOptions = {},
): string {
const lines: string[] = [];
const comparison = outcome?.kind === 'ok' ? outcome.result : undefined;
lines.push(formatHeading(options.commitSha));
lines.push('');
lines.push(formatTopAlert(outcome));
lines.push('');
lines.push(formatAggregateBlock(evaluation, comparison));
lines.push('');
if (comparison) {
const hard = hardRegressions(comparison);
const soft = softRegressions(comparison);
const watch = watchList(comparison);
const imps = improvements(comparison);
const renderedAnyTable = hard.length > 0 || soft.length > 0 || imps.length > 0;
// Built once and reused across the regression-tier sections so each
// scenario row can carry a collapsible breakdown of its failed PR runs.
// Improvements skip the breakdown — they passed. Skipped entirely when
// the caller didn't pass a slug map (lookup would be ambiguous).
const failedIndex = options.slugByTestCase
? buildFailedRunsIndex(evaluation, options.slugByTestCase)
: undefined;
if (hard.length > 0) {
lines.push(
...renderScenarioSection('Regressions', '— high-confidence', hard, true, failedIndex),
);
}
if (soft.length > 0) {
lines.push(
...renderScenarioSection(
'Soft regressions',
'— investigate if related to your changes',
soft,
true,
failedIndex,
),
);
}
if (watch.length > 0) {
lines.push(
...renderScenarioSection(
'Notable movement',
'— large gap, no statistical flag',
watch,
false,
failedIndex,
),
);
}
if (imps.length > 0) {
lines.push(...renderScenarioSection('Improvements', '', imps, true));
}
if (renderedAnyTable) {
lines.push(
"_p = Fisher's exact one-sided p-value. Lower = stronger evidence of a real change._",
);
lines.push('');
}
// Always render the breakdown when comparison data is available — the
// renderer drops 0/0 rows itself, so empty categories don't pollute
// the output but the reader still sees the full taxonomy of what's
// tracked.
lines.push(...renderFailureCategorySection(comparison.failureCategories));
}
lines.push(...renderPerTestCaseDetails(evaluation, options.slugByTestCase));
if (comparison) {
const otherFindings = renderOtherFindings(comparison);
if (otherFindings.length > 0) lines.push(...otherFindings);
}
const failureDetails = renderFailureDetails(evaluation, options.slugByTestCase);
if (failureDetails.length > 0) lines.push(...failureDetails);
return lines.join('\n');
}
function formatHeading(commitSha?: string): string {
const sha = commitSha ? ` \`${commitSha.slice(0, 8)}\`` : '';
return `### Instance AI Workflow Eval${sha}`;
}
function formatTopAlert(outcome?: ComparisonOutcome): string {
if (!outcome) {
return ['> [!NOTE]', '> No baseline comparison ran (LangSmith disabled for this run).'].join(
'\n',
);
}
if (outcome.kind === 'no_baseline') {
return [
'> [!NOTE]',
'> No baseline configured — comparison skipped. Run the eval with `--experiment-name instance-ai-baseline` on master to create one.',
].join('\n');
}
if (outcome.kind === 'self_baseline') {
return [
'> [!NOTE]',
`> This run is the baseline (\`${outcome.experimentName}\`) — nothing to compare against.`,
].join('\n');
}
if (outcome.kind === 'fetch_failed') {
return [
'> [!WARNING]',
`> Regression detection did not run — baseline fetch failed: ${outcome.error}`,
].join('\n');
}
const comparison = outcome.result;
const hard = hardRegressions(comparison).length;
const soft = softRegressions(comparison).length;
const watch = watchList(comparison).length;
const imps = improvements(comparison).length;
const stable = countByVerdict(comparison, 'stable');
const aggDelta = comparison.aggregate.delta * 100;
const aggDeltaText = `${aggDelta >= 0 ? '+' : ''}${aggDelta.toFixed(1)}pp`;
// Always include all five tier counts so readers see what's being tracked,
// not just what's > 0. The hard count is bolded when nonzero for emphasis.
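// e.g. "**2 regressions**, 1 soft, 0 notable, 1 improvement, 14 stable"
// (hypothetical counts).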
const summary = [
hard > 0 ? `**${hard} regression${hard === 1 ? '' : 's'}**` : '0 regressions',
`${soft} soft`,
`${watch} notable`,
`${imps} improvement${imps === 1 ? '' : 's'}`,
`${stable} stable`,
].join(', ');
let icon: string;
let alertKind: 'CAUTION' | 'WARNING' | 'NOTE' | 'TIP';
if (hard > 0) {
icon = '🔴';
alertKind = 'CAUTION';
} else if (soft > 0) {
icon = '🟡';
alertKind = 'WARNING';
} else if (watch > 0) {
icon = '🔵';
alertKind = 'NOTE';
} else {
icon = '🟢';
alertKind = 'TIP';
}
return `> [!${alertKind}]\n> ${icon} ${summary}. Pass rate ${aggDeltaText} vs master.`;
}
function formatAggregateBlock(
evaluation: MultiRunEvaluation,
comparison?: ComparisonResult,
): string {
if (!comparison) {
const allScenarios = evaluation.testCases.flatMap((tc) => tc.scenarios);
const passed = allScenarios.reduce((sum, sa) => sum + sa.passCount, 0);
const total = allScenarios.reduce((sum, sa) => sum + sa.runs.length, 0);
const rate = total > 0 ? (passed / total) * 100 : 0;
return `**Aggregate**: ${rate.toFixed(1)}% pass (${passed}/${total} trials, ${allScenarios.length} scenarios × N=${evaluation.totalRuns})`;
}
const { aggregate } = comparison;
const delta = aggregate.delta * 100;
const sign = delta >= 0 ? '+' : '';
const arrow = delta > 0 ? ' ↑' : delta < 0 ? ' ↓' : '';
const baselineN = inferBaselineN(comparison);
const sampleLine = baselineN
? `_${aggregate.intersectionSize} scenarios · N=${evaluation.totalRuns} (PR) vs N=${baselineN} (baseline) · baseline: \`${comparison.baseline.experimentName}\`_`
: `_${aggregate.intersectionSize} scenarios · N=${evaluation.totalRuns} (PR) · baseline: \`${comparison.baseline.experimentName}\`_`;
const partial = comparison.baselineOnly.length + comparison.prOnly.length;
const partialNote =
partial > 0
? `\n_Partial: ${[
comparison.baselineOnly.length > 0
? `${comparison.baselineOnly.length} baseline scenarios not run by PR`
: null,
comparison.prOnly.length > 0
? `${comparison.prOnly.length} PR scenarios have no baseline data (added since baseline captured)`
: null,
]
.filter((s) => s !== null)
.join(', ')}._`
: '';
return [
`**Aggregate**: ${pct(aggregate.prAggregatePassRate)}% PR vs ${pct(aggregate.baselineAggregatePassRate)}% baseline — **${sign}${delta.toFixed(1)}pp${arrow}**`,
sampleLine + partialNote,
].join('\n');
}
function renderScenarioSection(
heading: string,
subtitle: string,
scenarios: ScenarioComparison[],
withPValue: boolean,
failedIndex?: FailedRunsBySlug,
): string[] {
const lines: string[] = [];
const headingLine = subtitle
? `#### ${heading} (${scenarios.length}) ${subtitle}`
: `#### ${heading} (${scenarios.length})`;
lines.push(headingLine);
lines.push('');
if (withPValue) {
lines.push('| Scenario | PR | Baseline | Δ | p |');
lines.push('|---|---|---|---|---|');
} else {
lines.push('| Scenario | PR | Baseline | Δ |');
lines.push('|---|---|---|---|');
}
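// A rendered row looks like this (hypothetical numbers; the p column only
// appears when withPValue is true):
// | `some-workflow/happy-path` | 2/5 (40%) | 9/10 (90%) | -50pp ↓ | 0.077 |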
for (const s of scenarios) {
const cells = [
`\`${s.testCaseFile}/${s.scenarioName}\``,
formatRateCell(s.prPasses, s.prTotal),
formatRateCell(s.baselinePasses, s.baselineTotal),
formatDeltaCell(s.delta),
];
if (withPValue) {
const p = s.verdict === 'improvement' ? s.pValueRight : s.pValueLeft;
cells.push(p.toFixed(3));
}
lines.push(`| ${cells.join(' | ')} |`);
}
lines.push('');
// Per-scenario failure breakdown — one collapsible per row that had failed
// PR runs. Lets the reader drill into each flagged scenario without
// hunting through a separate "Failure details" section.
if (failedIndex) {
for (const s of scenarios) {
const failedRuns = failedIndex.get(`${s.testCaseFile}/${s.scenarioName}`) ?? [];
if (failedRuns.length === 0) continue;
lines.push(...renderScenarioFailureBreakdown(s, failedRuns));
}
}
return lines;
}
function renderScenarioFailureBreakdown(
s: ScenarioComparison,
failedRuns: FailedRunDetail[],
): string[] {
const slug = `${s.testCaseFile}/${s.scenarioName}`;
const categoryMix = summarizeCategories(failedRuns);
const summaryParts = [`${failedRuns.length} of ${s.prTotal} failed`];
if (categoryMix) summaryParts.push(categoryMix);
const lines: string[] = [];
lines.push(`<details><summary><code>${slug}</code> — ${summaryParts.join(' · ')}</summary>`);
lines.push('');
for (const fr of failedRuns) {
const tag = fr.category ? ` [${fr.category}]` : '';
lines.push(`> Run ${fr.runIndex}${tag}: ${fr.reasoning.slice(0, 300)}`);
lines.push('>');
}
// Drop the trailing empty quote line.
if (lines[lines.length - 1] === '>') lines.pop();
lines.push('');
lines.push('</details>');
lines.push('');
return lines;
}
function renderFailureCategorySection(categories: FailureCategoryComparison[]): string[] {
// Drop rows that are 0/0 on both sides — they carry no signal for the
// reader. Categories with non-zero count on either side are kept so the
// reader sees the full picture even if not "notable".
const rows = categories.filter((c) => c.prCount > 0 || c.baselineCount > 0);
if (rows.length === 0) return [];
const lines: string[] = [];
lines.push('#### Failure breakdown');
lines.push('');
lines.push('| Category | PR | Baseline | Δ | |');
lines.push('|---|---|---|---|---|');
for (const c of rows) {
const isNew = c.baselineCount === 0 && c.prCount > 0;
const label = isNew ? `\`${c.category}\` 🆕` : `\`${c.category}\``;
const delta = c.delta * 100;
const sign = delta >= 0 ? '+' : '';
const arrow = delta > 0 ? ' ↑' : delta < 0 ? ' ↓' : '';
const notableMarker = c.notable ? '**notable**' : '';
lines.push(
`| ${label} | ${c.prCount} (${pct(c.prRate)}%) | ${c.baselineCount} (${pct(c.baselineRate)}%) | ${sign}${delta.toFixed(1)}pp${arrow} | ${notableMarker} |`,
);
}
lines.push('');
return lines;
}
function renderPerTestCaseDetails(
evaluation: MultiRunEvaluation,
slugByTestCase?: Map<WorkflowTestCase, string>,
): string[] {
const { totalRuns, testCases } = evaluation;
if (testCases.length === 0) return [];
const lines: string[] = [];
lines.push(`<details><summary>Per-test-case results (${testCases.length})</summary>`);
lines.push('');
const renderName = (tc: TestCaseAggregation): string => {
const slug = slugByTestCase?.get(tc.testCase);
return slug ? `\`${slug}\`` : `\`${tc.testCase.prompt.slice(0, 70)}\``;
};
if (totalRuns > 1) {
lines.push(`| Workflow | Built | pass@${totalRuns} | pass^${totalRuns} |`);
lines.push('|---|---|---|---|');
for (const tc of testCases) {
const meanPassAtK = tc.scenarios.length
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passAtK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
const meanPassHatK = tc.scenarios.length
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passHatK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
lines.push(
`| ${renderName(tc)} | ${tc.buildSuccessCount}/${totalRuns} | ${meanPassAtK}% | ${meanPassHatK}% |`,
);
}
} else {
lines.push('| Workflow | Built | Pass rate |');
lines.push('|---|---|---|');
for (const tc of testCases) {
const built = tc.runs[0]?.workflowBuildSuccess ? '✓' : '✗';
const passed = tc.scenarios.filter((sa) => sa.runs[0]?.success).length;
const total = tc.scenarios.length;
lines.push(`| ${renderName(tc)} | ${built} | ${passed}/${total} |`);
}
}
lines.push('');
lines.push('</details>');
lines.push('');
return lines;
}
function renderOtherFindings(comparison: ComparisonResult): string[] {
const stable = countByVerdict(comparison, 'stable');
const flaky = countByVerdict(comparison, 'unreliable_baseline');
const noData = countByVerdict(comparison, 'insufficient_data');
if (stable === 0 && flaky === 0 && noData === 0) return [];
const summaryParts: string[] = [];
if (flaky > 0) summaryParts.push(`${flaky} on flaky baseline`);
if (noData > 0) summaryParts.push(`${noData} no data`);
if (stable > 0) summaryParts.push(`${stable} stable`);
const summary = summaryParts.join(' · ');
const lines: string[] = [];
lines.push(`<details><summary>Other findings: ${summary}</summary>`);
lines.push('');
const stableScenarios = comparison.scenarios.filter((s) => s.verdict === 'stable');
const flakyScenarios = comparison.scenarios.filter((s) => s.verdict === 'unreliable_baseline');
const noDataScenarios = comparison.scenarios.filter((s) => s.verdict === 'insufficient_data');
if (flakyScenarios.length > 0) {
lines.push('**Confident drop on a flaky baseline (surfaced for visibility, not flagged):**');
lines.push('');
lines.push('| Scenario | PR | Baseline | Δ |');
lines.push('|---|---|---|---|');
for (const s of flakyScenarios) {
lines.push(
`| \`${s.testCaseFile}/${s.scenarioName}\` | ${formatRateCell(s.prPasses, s.prTotal)} | ${formatRateCell(s.baselinePasses, s.baselineTotal)} | ${formatDeltaCell(s.delta)} |`,
);
}
lines.push('');
}
if (noDataScenarios.length > 0) {
lines.push(
`**No data:** ${noDataScenarios.map((s) => `\`${s.testCaseFile}/${s.scenarioName}\``).join(', ')}`,
);
lines.push('');
}
if (stableScenarios.length > 0) {
lines.push(`**Stable (${stableScenarios.length}):**`);
lines.push(
stableScenarios.map((s) => `\`${s.testCaseFile}/${s.scenarioName}\``).join(', ') + '.',
);
lines.push('');
}
lines.push('</details>');
lines.push('');
return lines;
}
function renderFailureDetails(
evaluation: MultiRunEvaluation,
slugByTestCase?: Map<WorkflowTestCase, string>,
): string[] {
const failed: Array<{
tc: WorkflowTestCaseResult;
fileSlug: string | undefined;
scenarioName: string;
failedRuns: Array<{ category?: string; reasoning: string }>;
}> = [];
for (const tc of evaluation.testCases) {
const fileSlug = slugByTestCase?.get(tc.testCase);
for (const sa of tc.scenarios) {
const failedRuns = sa.runs
.filter((r) => !r.success)
.map((r) => ({ category: r.failureCategory, reasoning: r.reasoning }));
if (failedRuns.length > 0) {
failed.push({ tc: tc.runs[0], fileSlug, scenarioName: sa.scenario.name, failedRuns });
}
}
}
if (failed.length === 0) return [];
const lines: string[] = [];
lines.push('<details><summary>Failure details</summary>');
lines.push('');
for (const { tc, fileSlug, scenarioName, failedRuns } of failed) {
const slug = fileSlug
? `${fileSlug}/${scenarioName}`
: `${tc.testCase.prompt.slice(0, 50).trim()} / ${scenarioName}`;
lines.push(`**\`${slug}\`** — ${failedRuns.length} failed`);
for (const fr of failedRuns) {
const tag = fr.category ? ` [${fr.category}]` : '';
lines.push(`> Run${tag}: ${fr.reasoning.slice(0, 200)}`);
}
lines.push('');
}
lines.push('</details>');
lines.push('');
return lines;
}
// ---------------------------------------------------------------------------
// Per-scenario failure lookup
// ---------------------------------------------------------------------------
//
// The comparison carries per-scenario counts (passed / total) but not the
// underlying reasoning text. The evaluation has the reasoning, but keys
// testCases by reference identity — not by the `testCaseFile` slug used in
// the comparison. The slug map (built in cli/index.ts where the file slugs
// are first known) bridges the two so the lookup is deterministic. Without
// it we'd have to disambiguate by scenarioName alone, which collides on
// reused names (`happy-path` shows up across most workflows).
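//
// Hypothetical shape of one index entry:
//
//   'some-workflow/happy-path' → [
//     { runIndex: 2, category: 'builder_issue', reasoning: '…' },
//   ]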
interface FailedRunDetail {
category?: string;
reasoning: string;
runIndex: number; // 1-based for display
}
type FailedRunsBySlug = Map<string, FailedRunDetail[]>;
function buildFailedRunsIndex(
evaluation: MultiRunEvaluation,
slugByTestCase: Map<WorkflowTestCase, string>,
): FailedRunsBySlug {
const map: FailedRunsBySlug = new Map();
for (const tc of evaluation.testCases) {
const fileSlug = slugByTestCase.get(tc.testCase);
if (!fileSlug) continue; // testCase not in the slug map — skip rather than misattribute
for (const sa of tc.scenarios) {
const failedRuns: FailedRunDetail[] = [];
sa.runs.forEach((r, i) => {
if (!r.success) {
failedRuns.push({
category: r.failureCategory,
reasoning: r.reasoning,
runIndex: i + 1,
});
}
});
if (failedRuns.length > 0) {
map.set(`${fileSlug}/${sa.scenario.name}`, failedRuns);
}
}
}
return map;
}
function summarizeCategories(failedRuns: FailedRunDetail[]): string | undefined {
const counts = new Map<string, number>();
for (const fr of failedRuns) {
if (fr.category) counts.set(fr.category, (counts.get(fr.category) ?? 0) + 1);
}
if (counts.size === 0) return undefined;
return [...counts.entries()]
.sort((a, b) => b[1] - a[1])
.map(([cat, n]) => `${n}× ${cat}`)
.join(', ');
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
function pct(rate: number): string {
return (rate * 100).toFixed(1);
}
function formatRateCell(passes: number, total: number): string {
const rate = total > 0 ? Math.round((passes / total) * 100) : 0;
return `${passes}/${total} (${rate}%)`;
}
function formatDeltaCell(delta: number): string {
const pp = delta * 100;
const sign = pp >= 0 ? '+' : '';
const arrow = pp > 0 ? ' ↑' : pp < 0 ? ' ↓' : '';
return `${sign}${pp.toFixed(0)}pp${arrow}`;
}
function countByVerdict(
comparison: ComparisonResult,
verdict: ScenarioComparison['verdict'],
): number {
return comparison.scenarios.filter((s) => s.verdict === verdict).length;
}
/** Best-effort N=baseline iteration count. The comparison only carries trial
* totals per scenario; we infer N from the most-common scenario total since
* the baseline runs every scenario the same number of times. */
function inferBaselineN(comparison: ComparisonResult): number | undefined {
const totals = comparison.scenarios
.filter((s) => s.baselineTotal > 0)
.map((s) => s.baselineTotal);
if (totals.length === 0) return undefined;
const counts = new Map<number, number>();
for (const t of totals) counts.set(t, (counts.get(t) ?? 0) + 1);
let best = totals[0];
let bestCount = 0;
for (const [n, c] of counts) {
if (c > bestCount) {
best = n;
bestCount = c;
}
}
return best;
}
// ---------------------------------------------------------------------------
// Terminal renderer: aligned plain text for the eval CLI's end-of-run print.
// ---------------------------------------------------------------------------
const TERMINAL_INDENT = ' ';
const TERMINAL_TABLE_INDENT = ' ';
export function formatComparisonTerminal(
evaluation: MultiRunEvaluation,
outcome?: ComparisonOutcome,
options: FormatOptions = {},
): string {
const lines: string[] = [];
const comparison = outcome?.kind === 'ok' ? outcome.result : undefined;
const titleSuffix = options.commitSha ? ` ${options.commitSha.slice(0, 8)}` : '';
const title = `Instance AI Workflow Eval${titleSuffix}`;
lines.push(title);
lines.push('═'.repeat(title.length));
lines.push(TERMINAL_INDENT + formatTerminalVerdictLine(outcome));
lines.push('');
lines.push(...formatTerminalAggregate(evaluation, comparison));
lines.push('');
lines.push(...formatTerminalPerTestCase(evaluation, options.slugByTestCase));
if (comparison) {
const hard = hardRegressions(comparison);
const soft = softRegressions(comparison);
const watch = watchList(comparison);
const imps = improvements(comparison);
if (hard.length > 0) {
lines.push(
TERMINAL_INDENT +
'REGRESSIONS (high-confidence: large drop on a reliable scenario, unlikely noise)',
);
lines.push(formatTerminalScenarioTable(hard, true));
lines.push('');
}
if (soft.length > 0) {
lines.push(
TERMINAL_INDENT +
'SOFT REGRESSIONS (likely natural variance — investigate if related to your changes)',
);
lines.push(formatTerminalScenarioTable(soft, true));
lines.push('');
}
if (watch.length > 0) {
lines.push(TERMINAL_INDENT + 'NOTABLE MOVEMENT (large gap, no statistical flag)');
lines.push(formatTerminalScenarioTable(watch, false));
lines.push('');
}
if (imps.length > 0) {
lines.push(TERMINAL_INDENT + 'IMPROVEMENTS');
lines.push(formatTerminalScenarioTable(imps, true));
lines.push('');
}
// Always render the breakdown when comparison data is available — same
// rationale as the markdown side. The terminal table drops 0/0 rows
// itself.
const breakdownRows = comparison.failureCategories.filter(
(c) => c.prCount > 0 || c.baselineCount > 0,
);
if (breakdownRows.length > 0) {
lines.push(TERMINAL_INDENT + 'failure breakdown');
lines.push(formatTerminalCategoryTable(breakdownRows));
lines.push('');
}
// Stable count is already in the verdict line; surface only the rarer
// outcomes here.
const flaky = countByVerdict(comparison, 'unreliable_baseline');
const noData = countByVerdict(comparison, 'insufficient_data');
const otherParts: string[] = [];
if (flaky > 0) otherParts.push(`${flaky} on flaky baseline`);
if (noData > 0) otherParts.push(`${noData} no data`);
if (otherParts.length > 0) {
lines.push(TERMINAL_INDENT + 'other: ' + otherParts.join(' · '));
}
}
return lines.join('\n');
}
function formatTerminalVerdictLine(outcome?: ComparisonOutcome): string {
if (!outcome) return '▶ No baseline comparison ran (LangSmith disabled).';
if (outcome.kind === 'no_baseline') {
return '▶ No baseline configured — comparison skipped.';
}
if (outcome.kind === 'self_baseline') {
return `▶ This run is the baseline (${outcome.experimentName}) — nothing to compare.`;
}
if (outcome.kind === 'fetch_failed') {
return `▶ Regression detection did not run — baseline fetch failed: ${outcome.error}`;
}
const comparison = outcome.result;
const hard = hardRegressions(comparison).length;
const soft = softRegressions(comparison).length;
const watch = watchList(comparison).length;
const imps = improvements(comparison).length;
const stable = countByVerdict(comparison, 'stable');
const aggDelta = comparison.aggregate.delta * 100;
const aggDeltaText = `${aggDelta >= 0 ? '+' : ''}${aggDelta.toFixed(1)}pp`;
const summary = [
`${hard} regression${hard === 1 ? '' : 's'}`,
`${soft} soft`,
`${watch} notable`,
`${imps} improvement${imps === 1 ? '' : 's'}`,
`${stable} stable`,
].join(', ');
return `${summary}. Pass rate ${aggDeltaText} vs master.`;
}
function formatTerminalAggregate(
evaluation: MultiRunEvaluation,
comparison?: ComparisonResult,
): string[] {
const lines: string[] = [];
if (!comparison) {
const allScenarios = evaluation.testCases.flatMap((tc) => tc.scenarios);
const passed = allScenarios.reduce((sum, sa) => sum + sa.passCount, 0);
const total = allScenarios.reduce((sum, sa) => sum + sa.runs.length, 0);
const rate = total > 0 ? (passed / total) * 100 : 0;
lines.push(
TERMINAL_INDENT +
`Aggregate: ${rate.toFixed(1)}% pass (${passed}/${total} trials, ${allScenarios.length} scenarios × N=${evaluation.totalRuns})`,
);
return lines;
}
const { aggregate } = comparison;
const baselineN = inferBaselineN(comparison);
const aggDelta = aggregate.delta * 100;
const sign = aggDelta >= 0 ? '+' : '';
const arrow = aggDelta > 0 ? ' ↑' : aggDelta < 0 ? ' ↓' : '';
lines.push(TERMINAL_INDENT + `Aggregate (${aggregate.intersectionSize} scenarios)`);
lines.push(
TERMINAL_INDENT +
` PR ${pct(aggregate.prAggregatePassRate)}% (N=${evaluation.totalRuns})`,
);
if (baselineN !== undefined) {
lines.push(
TERMINAL_INDENT +
` baseline ${pct(aggregate.baselineAggregatePassRate)}% (N=${baselineN})`,
);
} else {
lines.push(TERMINAL_INDENT + ` baseline ${pct(aggregate.baselineAggregatePassRate)}%`);
}
lines.push(TERMINAL_INDENT + ` Δ ${sign}${aggDelta.toFixed(1)}pp${arrow}`);
if (comparison.baselineOnly.length > 0 || comparison.prOnly.length > 0) {
const partialParts: string[] = [];
if (comparison.baselineOnly.length > 0)
partialParts.push(`${comparison.baselineOnly.length} baseline scenarios not run by PR`);
if (comparison.prOnly.length > 0)
partialParts.push(`${comparison.prOnly.length} PR scenarios have no baseline data`);
lines.push(TERMINAL_INDENT + ` partial: ${partialParts.join(', ')}`);
}
return lines;
}
function formatTerminalPerTestCase(
evaluation: MultiRunEvaluation,
slugByTestCase?: Map<WorkflowTestCase, string>,
): string[] {
const { totalRuns, testCases } = evaluation;
if (testCases.length === 0) return [];
const lines: string[] = [];
const heading = `Per-test-case results (${testCases.length})`;
lines.push(TERMINAL_INDENT + heading);
const nameOf = (tc: TestCaseAggregation, max: number): string => {
const slug = slugByTestCase?.get(tc.testCase);
return slug ?? tc.testCase.prompt.slice(0, max);
};
if (totalRuns > 1) {
const rows = testCases.map((tc) => {
const meanPassAtK =
tc.scenarios.length > 0
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passAtK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
const meanPassHatK =
tc.scenarios.length > 0
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passHatK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
return {
name: nameOf(tc, 60),
builds: `${tc.buildSuccessCount}/${totalRuns}`,
passAtK: `${meanPassAtK}%`,
passHatK: `${meanPassHatK}%`,
};
});
const nameW = maxWidth(
rows.map((r) => r.name),
'workflow',
);
const buildsW = maxWidth(
rows.map((r) => r.builds),
'builds',
);
const atKHeader = `pass@${totalRuns}`;
const hatKHeader = `pass^${totalRuns}`;
const atKW = maxWidth(
rows.map((r) => r.passAtK),
atKHeader,
);
const hatKW = maxWidth(
rows.map((r) => r.passHatK),
hatKHeader,
);
lines.push(
TERMINAL_TABLE_INDENT +
`${'workflow'.padEnd(nameW)} ${'builds'.padEnd(buildsW)} ${atKHeader.padStart(atKW)} ${hatKHeader.padStart(hatKW)}`,
);
lines.push(
TERMINAL_TABLE_INDENT +
`${'─'.repeat(nameW)} ${'─'.repeat(buildsW)} ${'─'.repeat(atKW)} ${'─'.repeat(hatKW)}`,
);
for (const r of rows) {
lines.push(
TERMINAL_TABLE_INDENT +
`${r.name.padEnd(nameW)} ${r.builds.padEnd(buildsW)} ${r.passAtK.padStart(atKW)} ${r.passHatK.padStart(hatKW)}`,
);
}
} else {
for (const tc of testCases) {
const r = tc.runs[0];
const buildStatus = r.workflowBuildSuccess ? 'BUILT' : 'BUILD FAILED';
lines.push('');
lines.push(TERMINAL_INDENT + `${nameOf(tc, 70)}`);
lines.push(TERMINAL_INDENT + ` ${buildStatus}${r.workflowId ? ` (${r.workflowId})` : ''}`);
if (r.buildError) lines.push(TERMINAL_INDENT + ` error: ${r.buildError.slice(0, 200)}`);
for (const sa of tc.scenarios) {
const sr = sa.runs[0];
const status = sr.success ? 'PASS' : 'FAIL';
const category = sr.failureCategory ? ` [${sr.failureCategory}]` : '';
lines.push(TERMINAL_INDENT + ` ${status} ${sr.scenario.name}${category}`);
if (!sr.success) {
const errs = sr.evalResult?.errors ?? [];
if (errs.length > 0) {
lines.push(TERMINAL_INDENT + ` error: ${errs.join('; ').slice(0, 200)}`);
}
lines.push(TERMINAL_INDENT + ` diagnosis: ${sr.reasoning.slice(0, 200)}`);
}
}
}
}
lines.push('');
return lines;
}
function formatTerminalScenarioTable(scenarios: ScenarioComparison[], withPValue: boolean): string {
const names = scenarios.map((s) => `${s.testCaseFile}/${s.scenarioName}`);
const prCells = scenarios.map((s) => `${s.prPasses}/${s.prTotal}`);
const baseCells = scenarios.map((s) => `${s.baselinePasses}/${s.baselineTotal}`);
const deltaCells = scenarios.map((s) => {
const d = s.delta * 100;
const sign = d >= 0 ? '+' : '';
const arrow = d > 0 ? ' ↑' : d < 0 ? ' ↓' : '';
return `${sign}${d.toFixed(0)}pp${arrow}`;
});
const pCells = withPValue
? scenarios.map((s) => (s.verdict === 'improvement' ? s.pValueRight : s.pValueLeft).toFixed(3))
: [];
const nameW = maxWidth(names, 'scenario');
const prW = maxWidth(prCells, 'PR');
const baseW = maxWidth(baseCells, 'baseline');
const deltaW = maxWidth(deltaCells, 'Δ');
const pW = withPValue ? maxWidth(pCells, 'p') : 0;
const headers = [
'scenario'.padEnd(nameW),
'PR'.padEnd(prW),
'baseline'.padEnd(baseW),
'Δ'.padEnd(deltaW),
];
if (withPValue) headers.push('p'.padEnd(pW));
const widths = withPValue ? [nameW, prW, baseW, deltaW, pW] : [nameW, prW, baseW, deltaW];
const sep = widths.map((w) => '─'.repeat(w)).join(' ');
const rows = scenarios.map((_, i) => {
const cells = [
names[i].padEnd(nameW),
prCells[i].padEnd(prW),
baseCells[i].padEnd(baseW),
deltaCells[i].padEnd(deltaW),
];
if (withPValue) cells.push(pCells[i].padEnd(pW));
return TERMINAL_TABLE_INDENT + cells.join(' ');
});
return [TERMINAL_TABLE_INDENT + headers.join(' '), TERMINAL_TABLE_INDENT + sep, ...rows].join(
'\n',
);
}
function formatTerminalCategoryTable(cats: FailureCategoryComparison[]): string {
const names = cats.map((c) => {
const isNew = c.baselineCount === 0 && c.prCount > 0;
return c.category + (isNew ? ' 🆕' : '');
});
const prCells = cats.map((c) => `${c.prCount} (${pct(c.prRate)}%)`);
const baseCells = cats.map((c) => `${c.baselineCount} (${pct(c.baselineRate)}%)`);
const deltaCells = cats.map((c) => {
const d = c.delta * 100;
const sign = d >= 0 ? '+' : '';
return `${sign}${d.toFixed(1)}pp`;
});
const nameW = maxWidth(names, 'category');
const prW = maxWidth(prCells, 'PR');
const baseW = maxWidth(baseCells, 'baseline');
const headers = ['category'.padEnd(nameW), 'PR'.padEnd(prW), 'baseline'.padEnd(baseW), 'Δ'];
const sep = [nameW, prW, baseW, maxWidth(deltaCells, 'Δ')].map((w) => '─'.repeat(w)).join(' ');
const rows = cats.map(
(_, i) =>
TERMINAL_TABLE_INDENT +
[
names[i].padEnd(nameW),
prCells[i].padEnd(prW),
baseCells[i].padEnd(baseW),
deltaCells[i],
].join(' '),
);
return [TERMINAL_TABLE_INDENT + headers.join(' '), TERMINAL_TABLE_INDENT + sep, ...rows].join(
'\n',
);
}
function maxWidth(values: string[], header: string): number {
return values.reduce((m, v) => Math.max(m, v.length), header.length);
}

View File

@ -0,0 +1,304 @@
// ---------------------------------------------------------------------------
// Decides whether one scenario's pass rate is meaningfully worse than
// another, at the small sample sizes evals run at (N=3 typically).
//
// Public surface:
// - classifyScenario(prPasses, prTotal, basePasses, baseTotal) — the verdict
// - wilsonInterval(passes, total) — confidence band for a pass rate, used
// for the headline aggregate
//
// The implementation uses Fisher's exact test and the Wilson score interval
// under the hood; both are standard small-sample statistics. You don't need
// to know either to use the public API.
// ---------------------------------------------------------------------------
import { strict as assert } from 'node:assert';
// ---------------------------------------------------------------------------
// Fisher's exact test (one-sided)
//
// Given a 2×2 table of pass/fail counts for PR vs baseline, returns the
// probability of seeing a gap at least as bad as the observed one if the two
// groups actually had the same pass rate. Small return value ⇒ strong
// evidence the PR is worse.
// ---------------------------------------------------------------------------
const logFactorialCache: number[] = [0, 0];
function logFactorial(n: number): number {
for (let i = logFactorialCache.length; i <= n; i++) {
logFactorialCache.push(logFactorialCache[i - 1] + Math.log(i));
}
return logFactorialCache[n];
}
function logBinomial(n: number, k: number): number {
if (k < 0 || k > n) return -Infinity;
return logFactorial(n) - logFactorial(k) - logFactorial(n - k);
}
function hypergeomPmf(nPasses: number, nFails: number, nDrawn: number, k: number): number {
const total = nPasses + nFails;
if (k < Math.max(0, nDrawn - nFails) || k > Math.min(nDrawn, nPasses)) return 0;
return Math.exp(
logBinomial(nPasses, k) + logBinomial(nFails, nDrawn - k) - logBinomial(total, nDrawn),
);
}
/**
* One-sided Fisher's exact test (left tail). Returns the probability that
* PR's pass count would be at most `a` if PR and baseline shared the same
* underlying pass rate. Small value ⇒ PR is significantly worse.
*
* 2×2 table:
*
* passed failed
* PR | a | b |
* Baseline | c | d |
*
* Returns 1 (no information) when either side has no trials, or when all
* trials passed or all failed.
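*
* @example
* fishersExactOneSidedLeft(0, 3, 3, 0); // PR 0/3 vs baseline 3/3 → 0.05
* fishersExactOneSidedLeft(0, 5, 5, 0); // PR 0/5 vs baseline 5/5 → ≈ 0.004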
*/
export function fishersExactOneSidedLeft(a: number, b: number, c: number, d: number): number {
const inputs = [a, b, c, d];
for (const v of inputs) {
assert(
Number.isInteger(v) && v >= 0,
'fishersExactOneSidedLeft requires non-negative integers',
);
}
const nPr = a + b;
const nBase = c + d;
const nPasses = a + c;
const nFails = b + d;
if (nPr === 0 || nBase === 0) return 1;
if (nPasses === 0 || nFails === 0) return 1;
let pValue = 0;
const kMax = Math.min(a, nPasses);
for (let k = 0; k <= kMax; k++) {
pValue += hypergeomPmf(nPasses, nFails, nPr, k);
}
// Clamp to [0, 1] — accumulated FP error can push the sum slightly past 1.
return Math.min(1, Math.max(0, pValue));
}
// ---------------------------------------------------------------------------
// Wilson score interval (95% confidence)
//
// Returns a confidence band for a pass rate that behaves well at small N and
// at extreme rates (close to 0 or 1) — both common in our evals. Used for
// the headline aggregate band only; classification doesn't need it.
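//
// For instance, 3/3 passes gives roughly [0.44, 1.00]: three clean trials are
// still compatible with a true pass rate anywhere above ~44%.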
// ---------------------------------------------------------------------------
// Standard z-score for a 95% confidence interval. We only ever use 95%, so
// the value is inlined rather than parameterised.
const Z_95 = 1.96;
export function wilsonInterval(passes: number, total: number): { lower: number; upper: number } {
assert(
Number.isInteger(passes) && passes >= 0,
'wilsonInterval: passes must be a non-negative integer',
);
assert(
Number.isInteger(total) && total >= 0,
'wilsonInterval: total must be a non-negative integer',
);
assert(passes <= total, 'wilsonInterval: passes cannot exceed total');
if (total === 0) return { lower: 0, upper: 1 };
const p = passes / total;
const z2 = Z_95 * Z_95;
const denom = 1 + z2 / total;
const center = (p + z2 / (2 * total)) / denom;
const halfWidth = (Z_95 * Math.sqrt((p * (1 - p)) / total + z2 / (4 * total * total))) / denom;
return {
lower: Math.max(0, center - halfWidth),
upper: Math.min(1, center + halfWidth),
};
}
// ---------------------------------------------------------------------------
// Per-scenario classification
//
// Three flag tiers, evaluated in order of strictness:
//
// hard_regression — high-confidence drop on a reliable baseline.
// Gating-grade.
// soft_regression — looser bar; investigate, not gating.
// watch — moved noticeably but didn't pass either flag tier.
// Pure visibility.
//
// Improvements use the hard tier (we don't surface borderline improvements;
// they tend to be noise in the positive direction).
// ---------------------------------------------------------------------------
export type ScenarioVerdict =
| 'hard_regression' // PR is confidently worse, baseline was reliable
| 'soft_regression' // looser bar — worth investigating, not high-confidence
| 'watch' // moved enough to surface but no flag tier triggered
| 'improvement' // PR is significantly better
| 'stable' // no meaningful change
| 'unreliable_baseline' // confident drop but baseline was too flaky to trust
| 'insufficient_data'; // either side had zero trials
export interface ScenarioClassification {
verdict: ScenarioVerdict;
/** PR pass rate (0..1) */
prPassRate: number;
/** Baseline pass rate (0..1) */
baselinePassRate: number;
/** PR rate minus baseline rate, signed. Negative = PR worse. */
delta: number;
/** Probability the PR is at least this much worse by chance. Lower ⇒ stronger regression evidence. */
pValueLeft: number;
/** Probability the PR is at least this much better by chance. */
pValueRight: number;
}
export interface TierThresholds {
/** Flag only when the chance the gap happened by noise is below this. */
maxPValue: number;
/** Flag only when the absolute pass-rate gap is at least this large (0..1). */
minDelta: number;
/** Flag only when the baseline pass rate was at least this high (0..1). */
minBaselinePassRate: number;
}
export interface ClassifyOptions {
/** Hard-flag thresholds (most strict). Defaults: maxPValue=0.05, minDelta=0.30, minBaselinePassRate=0.70. */
hard?: Partial<TierThresholds>;
/** Soft-flag thresholds (looser). Defaults: maxPValue=0.20, minDelta=0.15, minBaselinePassRate=0.50. */
soft?: Partial<TierThresholds>;
/** Absolute pass-rate change required for a "watch" verdict regardless of significance. Default 0.35. */
watchDelta?: number;
}
const DEFAULT_HARD: TierThresholds = {
maxPValue: 0.05,
minDelta: 0.3,
minBaselinePassRate: 0.7,
};
const DEFAULT_SOFT: TierThresholds = {
maxPValue: 0.2,
minDelta: 0.15,
minBaselinePassRate: 0.5,
};
// Watch threshold: surface scenarios whose pass rate changed by at least 35pp
// without reaching a flag tier. High enough that natural noise on rock-solid
// scenarios (e.g. 2/3 vs 10/10 = 33pp) doesn't crowd the comment.
const DEFAULT_WATCH_DELTA = 0.35;
function meetsThreshold(
pValue: number,
delta: number,
baselineRate: number,
tier: TierThresholds,
direction: 'worse' | 'better',
): boolean {
if (pValue >= tier.maxPValue) return false;
if (direction === 'worse') {
if (delta > -tier.minDelta) return false;
if (baselineRate < tier.minBaselinePassRate) return false;
} else {
if (delta < tier.minDelta) return false;
// Improvements skip the reliability gate — fixing flaky scenarios is a real win.
}
return true;
}
/**
* Classify a single scenario into one of seven verdicts. See ScenarioVerdict
* for the tier semantics.
*
* `options` exists for tests; production callers leave thresholds at defaults.
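*
* @example
* // Hypothetical counts: PR 1/5 vs a solid 9/10 baseline.
* // delta = -0.70, Fisher p ≈ 0.017, baseline rate 0.90, so all three hard
* // gates pass.
* classifyScenario(1, 5, 9, 10).verdict; // 'hard_regression'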
*/
export function classifyScenario(
prPasses: number,
prTotal: number,
baselinePasses: number,
baselineTotal: number,
options: ClassifyOptions = {},
): ScenarioClassification {
const hard: TierThresholds = { ...DEFAULT_HARD, ...options.hard };
const soft: TierThresholds = { ...DEFAULT_SOFT, ...options.soft };
const watchDelta = options.watchDelta ?? DEFAULT_WATCH_DELTA;
const prPassRate = prTotal > 0 ? prPasses / prTotal : 0;
const baselinePassRate = baselineTotal > 0 ? baselinePasses / baselineTotal : 0;
if (prTotal === 0 || baselineTotal === 0) {
return {
verdict: 'insufficient_data',
prPassRate,
baselinePassRate,
delta: prPassRate - baselinePassRate,
pValueLeft: 1,
pValueRight: 1,
};
}
const a = prPasses;
const b = prTotal - prPasses;
const c = baselinePasses;
const d = baselineTotal - baselinePasses;
const pValueLeft = fishersExactOneSidedLeft(a, b, c, d);
const pValueRight = fishersExactOneSidedLeft(c, d, a, b);
const delta = prPassRate - baselinePassRate;
// Improvement (right tail) — single tier, hard thresholds only
if (meetsThreshold(pValueRight, delta, baselinePassRate, hard, 'better')) {
return { verdict: 'improvement', prPassRate, baselinePassRate, delta, pValueLeft, pValueRight };
}
// Hard regression — passes all three hard gates
if (meetsThreshold(pValueLeft, delta, baselinePassRate, hard, 'worse')) {
return {
verdict: 'hard_regression',
prPassRate,
baselinePassRate,
delta,
pValueLeft,
pValueRight,
};
}
// Confident drop, but on a baseline too flaky to call a regression.
// Surface as `unreliable_baseline` so it's visible without being a flag.
if (
pValueLeft < hard.maxPValue &&
delta <= -hard.minDelta &&
baselinePassRate < hard.minBaselinePassRate
) {
return {
verdict: 'unreliable_baseline',
prPassRate,
baselinePassRate,
delta,
pValueLeft,
pValueRight,
};
}
// Soft regression — passes the looser gates
if (meetsThreshold(pValueLeft, delta, baselinePassRate, soft, 'worse')) {
return {
verdict: 'soft_regression',
prPassRate,
baselinePassRate,
delta,
pValueLeft,
pValueRight,
};
}
// Watch — meaningful movement but no flag fired. Pure visibility.
if (Math.abs(delta) >= watchDelta) {
return { verdict: 'watch', prPassRate, baselinePassRate, delta, pValueLeft, pValueRight };
}
return { verdict: 'stable', prPassRate, baselinePassRate, delta, pValueLeft, pValueRight };
}

View File

@ -28,7 +28,7 @@ import type {
// Constants
// ---------------------------------------------------------------------------
const DEFAULT_TIMEOUT_MS = 600_000;
const DEFAULT_TIMEOUT_MS = 900_000;
const SSE_SETTLE_DELAY_MS = 200;
const POLL_INTERVAL_MS = 500;
const BACKGROUND_TASK_POLL_INTERVAL_MS = 2_000;

View File

@ -39,3 +39,38 @@ export type {
ChecklistItem,
ChecklistResult,
} from './types';
// -- Comparison (regression detection) --
export {
compareBuckets,
byVerdict,
improvements,
hardRegressions,
softRegressions,
watchList,
} from './comparison/compare';
export type {
ComparisonResult,
ScenarioComparison,
ScenarioCounts,
ExperimentBucket,
AggregateComparison,
FailureCategoryComparison,
} from './comparison/compare';
export {
classifyScenario,
fishersExactOneSidedLeft,
wilsonInterval,
} from './comparison/statistics';
export type {
ScenarioVerdict,
ScenarioClassification,
ClassifyOptions,
TierThresholds,
} from './comparison/statistics';
export { formatComparisonMarkdown, formatComparisonTerminal } from './comparison/format';
export {
fetchBaselineBucket,
findLatestBaseline,
BASELINE_EXPERIMENT_PREFIX,
} from './comparison/fetch-baseline';