n8n/packages/@n8n/instance-ai/evaluations/cli/aggregator.ts

import type {
	WorkflowTestCaseResult,
	MultiRunEvaluation,
	TestCaseAggregation,
	ScenarioAggregation,
} from '../types';

/**
 * Binomial coefficient C(n, k). Returns 0 when k > n.
 */
function combinations(n: number, k: number): number {
	if (k > n || k < 0) return 0;
	if (k === 0 || k === n) return 1;
	let result = 1;
	for (let i = 0; i < k; i++) {
		result = (result * (n - i)) / (i + 1);
	}
	return Math.round(result);
}

/**
 * pass@k = 1 - C(n - c, k) / C(n, k)
 * Probability that at least 1 of k randomly chosen samples passes,
 * given n total samples of which c passed.
 */
export function passAtK(n: number, c: number, k: number): number {
	if (k > n) return 0;
	const denominator = combinations(n, k);
	if (denominator === 0) return 0;
	return 1 - combinations(n - c, k) / denominator;
}

/**
 * pass^k = (c/n)^k
 * Probability that all k independent attempts pass,
 * given observed success rate p = c/n.
 */
export function passHatK(n: number, c: number, k: number): number {
	if (n === 0) return 0;
	return Math.pow(c / n, k);
}

function computePassMetrics(
	n: number,
	c: number,
): { passAtKValues: number[]; passHatKValues: number[] } {
	const passAtKValues: number[] = [];
	const passHatKValues: number[] = [];
	for (let k = 1; k <= n; k++) {
		passAtKValues.push(passAtK(n, c, k));
		passHatKValues.push(passHatK(n, c, k));
	}
	return { passAtKValues, passHatKValues };
}

export function aggregateResults(
	allRunResults: WorkflowTestCaseResult[][],
	totalRuns: number,
): MultiRunEvaluation {
	const testCaseCount = allRunResults[0].length;
	const testCases: TestCaseAggregation[] = [];

	for (let tcIdx = 0; tcIdx < testCaseCount; tcIdx++) {
		const runs = allRunResults.map((runResults) => runResults[tcIdx]);
		const testCase = runs[0].testCase;
		const buildSuccessCount = runs.filter((r) => r.workflowBuildSuccess).length;

		const scenarioCount = testCase.scenarios.length;
		const scenarios: ScenarioAggregation[] = [];

		for (let sIdx = 0; sIdx < scenarioCount; sIdx++) {
			const scenario = testCase.scenarios[sIdx];
			const scenarioRuns = runs.map(
				(r) =>
					r.scenarioResults[sIdx] ?? {
						scenario,
						success: false,
						score: 0,
						reasoning: 'Build failed — scenario not executed',
					},
			);
			const passCount = scenarioRuns.filter((sr) => sr.success).length;
			const { passAtKValues, passHatKValues } = computePassMetrics(totalRuns, passCount);

			scenarios.push({
				scenario,
				runs: scenarioRuns,
				passCount,
				passRate: totalRuns > 0 ? passCount / totalRuns : 0,
				passAtK: passAtKValues,
				passHatK: passHatKValues,
			});
		}

		testCases.push({ testCase, runs, buildSuccessCount, scenarios });
	}

	return { totalRuns, testCases };
}