n8n/packages/@n8n/ai-workflow-builder.ee/evaluations/cli/runner.ts

import pLimit from 'p-limit';
import pc from 'picocolors';

import { createProgressBar, updateProgress, displayResults, displayError } from './display.js';
import { basicTestCases, generateTestCases } from '../chains/test-case-generator.js';
import {
	setupTestEnvironment,
	createAgent,
	getConcurrencyLimit,
	shouldGenerateTestCases,
	howManyTestCasesToGenerate,
} from '../core/environment.js';
import { runSingleTest, initializeTestTracking } from '../core/test-runner.js';
import type { TestCase } from '../types/evaluation.js';
import type { TestResult } from '../types/test-result.js';
import {
	calculateTestMetrics,
	calculateCategoryAverages,
	countViolationsByType,
} from '../utils/evaluation-calculator.js';
import { formatHeader, saveEvaluationResults } from '../utils/evaluation-helpers.js';
import { generateMarkdownReport } from '../utils/evaluation-reporter.js';

/** Options accepted by the CLI evaluation runner. */
type CliEvaluationOptions = {
	/**
	 * Optional array of test cases to run
	 * (if not provided, uses defaults and generation).
	 */
	testCases?: TestCase[];
	/** Optional test case ID to run only a specific test. */
	testCaseFilter?: string;
	/** Number of times to run each test (e.g. for cache warming analysis). */
	repetitions?: number;
};

/**
 * Main CLI evaluation runner that executes all test cases in parallel.
 *
 * Flow:
 * 1. Sets up the test environment (node types, LLM, tracer).
 * 2. Picks the test cases: `options.testCases` when non-empty, otherwise the
 *    built-in `basicTestCases`. With `testCaseFilter` only the matching case is
 *    run; without it (and without provided cases) extra cases are generated
 *    when `shouldGenerateTestCases()` says so.
 * 3. Runs every test case once per repetition, throttled by the limit returned
 *    from `getConcurrencyLimit()` (EVALUATION_CONCURRENCY environment variable).
 * 4. Displays, reports and saves the results of the LAST repetition only.
 *
 * Any error raised along the way is caught and handed to `displayError`; it is
 * not re-thrown from here.
 *
 * @param options - Optional test selection and repetition settings.
 *   `repetitions` defaults to 1 and must be greater than 0.
 * @returns A promise that settles once the evaluation has finished or the
 *   failure has been reported.
 */
export async function runCliEvaluation(options: CliEvaluationOptions = {}): Promise<void> {
	const { repetitions = 1, testCaseFilter } = options;

	console.log(formatHeader('AI Workflow Builder Full Evaluation', 70));
	if (repetitions > 1) {
		console.log(pc.yellow(`➔ Each test will be run ${repetitions} times for cache analysis`));
	}
	console.log();
	try {
		// Fail fast, before the environment is set up: with a value that is not
		// greater than 0 (including NaN) the repetition loop below never runs and
		// there would be no results to display or save.
		if (!(repetitions > 0)) {
			throw new Error(
				`Invalid repetitions value "${repetitions}": expected a number greater than 0`,
			);
		}

		// Setup test environment
		const { parsedNodeTypes, llm, tracer } = await setupTestEnvironment();

		// Determine test cases to run (an empty array counts as "not provided")
		const providedTestCases =
			options.testCases && options.testCases.length > 0 ? options.testCases : undefined;

		let testCases: TestCase[] = providedTestCases ?? basicTestCases;

		if (providedTestCases) {
			console.log(pc.blue(`➔ Loaded ${providedTestCases.length} test cases from CSV`));
		}

		// Filter to single test case if specified
		if (testCaseFilter) {
			const filteredCase = testCases.find((tc) => tc.id === testCaseFilter);
			if (filteredCase) {
				testCases = [filteredCase];
				console.log(pc.blue(`➔ Running single test case: ${filteredCase.name}`));
			} else {
				console.log(pc.red(`❌ Test case '${testCaseFilter}' not found`));
				console.log(pc.dim(`Available test cases: ${testCases.map((tc) => tc.id).join(', ')}`));
				return;
			}
		} else if (!providedTestCases && shouldGenerateTestCases()) {
			// Optionally generate additional test cases (only on top of the defaults)
			console.log(pc.blue('➔ Generating additional test cases...'));
			const generatedCases = await generateTestCases(llm, howManyTestCasesToGenerate());
			testCases = [...testCases, ...generatedCases];
		}

		// Get concurrency from environment
		const concurrency = getConcurrencyLimit();
		console.log(pc.dim(`Running ${testCases.length} test cases with concurrency=${concurrency}`));
		console.log();

		const startTime = Date.now();
		const allRepetitionResults: TestResult[][] = [];

		// Run tests for each repetition
		for (let rep = 0; rep < repetitions; rep++) {
			if (repetitions > 1) {
				console.log(pc.cyan(`\n═══ Repetition ${rep + 1}/${repetitions} ═══\n`));
			}

			// Create progress bar for this repetition
			const progressBar = createProgressBar(testCases.length);

			// Create concurrency limiter
			const limit = pLimit(concurrency);

			// Track progress
			let completed = 0;
			const testResults = initializeTestTracking(testCases);

			// Run all test cases in parallel with concurrency limit
			const promises = testCases.map((testCase) =>
				limit(async () => {
					updateProgress(progressBar, completed, testCases.length, `Running: ${testCase.name}`);

					// Create a dedicated agent for this test to avoid state conflicts
					const testAgent = createAgent(parsedNodeTypes, llm, tracer);
					const result = await runSingleTest(testAgent, llm, testCase, parsedNodeTypes);

					testResults[testCase.id] = result.error ? 'fail' : 'pass';
					completed++;
					updateProgress(progressBar, completed, testCases.length);
					return result;
				}),
			);

			// Stop the progress bar whether the batch succeeded or not, so a
			// rejected test does not leave the bar active while the error from the
			// outer catch is being printed.
			const results = await Promise.all(promises).finally(() => {
				progressBar.stop();
			});
			allRepetitionResults.push(results);

			// Show brief stats for this repetition if running multiple times
			if (repetitions > 1) {
				const repStats = results.map((r) => r.cacheStats).filter((s) => s !== undefined);
				if (repStats.length > 0) {
					const avgHitRate = repStats.reduce((sum, s) => sum + s.cacheHitRate, 0) / repStats.length;
					console.log(
						pc.dim(`\n  Repetition ${rep + 1} cache hit rate: ${(avgHitRate * 100).toFixed(1)}%`),
					);
				}
			}
		}

		// Wall-clock time across ALL repetitions
		const totalTime = Date.now() - startTime;

		// Use last repetition results for display (most representative)
		const results = allRepetitionResults[allRepetitionResults.length - 1];

		// Display results
		displayResults(testCases, results, totalTime);

		// Calculate metrics for report
		const metrics = calculateTestMetrics(results);
		const categoryAverages = calculateCategoryAverages(results);
		const violationCounts = countViolationsByType(results);

		const combinedMetrics = {
			...metrics,
			categoryAverages,
			violationCounts,
		};

		// Generate and save results
		const report = generateMarkdownReport(results, combinedMetrics);
		const { reportPath, resultsPath } = saveEvaluationResults(results, report);

		console.log(`\nReport saved to: ${reportPath}`);
		console.log(`Detailed results saved to: ${resultsPath}`);
	} catch (error) {
		// Report instead of re-throwing: callers of the CLI runner get a resolved
		// promise and the failure is surfaced through displayError.
		displayError('Evaluation failed', error);
	}
}