n8n/packages/@n8n/instance-ai/evaluations/cli/aggregator.ts
José Braulio González Valido 16e5f9572f
feat(ai-builder): Add LangSmith integration for workflow eval tracking (no-changelog) (#28835)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 08:47:02 +00:00

100 lines
2.7 KiB
TypeScript

import type {
WorkflowTestCaseResult,
MultiRunEvaluation,
TestCaseAggregation,
ScenarioAggregation,
} from '../types';
/**
* Binomial coefficient C(n, k). Returns 0 when k > n.
*/
function combinations(n: number, k: number): number {
if (k > n || k < 0) return 0;
if (k === 0 || k === n) return 1;
let result = 1;
for (let i = 0; i < k; i++) {
result = (result * (n - i)) / (i + 1);
}
return Math.round(result);
}
/**
* pass@k = 1 - C(n - c, k) / C(n, k)
* Probability that at least 1 of k randomly chosen samples passes,
* given n total samples of which c passed.
*/
export function passAtK(n: number, c: number, k: number): number {
if (k > n) return 0;
const denominator = combinations(n, k);
if (denominator === 0) return 0;
return 1 - combinations(n - c, k) / denominator;
}
/**
* pass^k = (c/n)^k
* Probability that all k independent attempts pass,
* given observed success rate p = c/n.
*/
export function passHatK(n: number, c: number, k: number): number {
if (n === 0) return 0;
return Math.pow(c / n, k);
}
function computePassMetrics(
n: number,
c: number,
): { passAtKValues: number[]; passHatKValues: number[] } {
const passAtKValues: number[] = [];
const passHatKValues: number[] = [];
for (let k = 1; k <= n; k++) {
passAtKValues.push(passAtK(n, c, k));
passHatKValues.push(passHatK(n, c, k));
}
return { passAtKValues, passHatKValues };
}
export function aggregateResults(
allRunResults: WorkflowTestCaseResult[][],
totalRuns: number,
): MultiRunEvaluation {
const testCaseCount = allRunResults[0].length;
const testCases: TestCaseAggregation[] = [];
for (let tcIdx = 0; tcIdx < testCaseCount; tcIdx++) {
const runs = allRunResults.map((runResults) => runResults[tcIdx]);
const testCase = runs[0].testCase;
const buildSuccessCount = runs.filter((r) => r.workflowBuildSuccess).length;
const scenarioCount = testCase.scenarios.length;
const scenarios: ScenarioAggregation[] = [];
for (let sIdx = 0; sIdx < scenarioCount; sIdx++) {
const scenario = testCase.scenarios[sIdx];
const scenarioRuns = runs.map(
(r) =>
r.scenarioResults[sIdx] ?? {
scenario,
success: false,
score: 0,
reasoning: 'Build failed — scenario not executed',
},
);
const passCount = scenarioRuns.filter((sr) => sr.success).length;
const { passAtKValues, passHatKValues } = computePassMetrics(totalRuns, passCount);
scenarios.push({
scenario,
runs: scenarioRuns,
passCount,
passRate: totalRuns > 0 ? passCount / totalRuns : 0,
passAtK: passAtKValues,
passHatK: passHatKValues,
});
}
testCases.push({ testCase, runs, buildSuccessCount, scenarios });
}
return { totalRuns, testCases };
}