mirror of
https://github.com/n8n-io/n8n.git
synced 2026-06-03 02:07:06 +02:00
162 lines
5.7 KiB
TypeScript
162 lines
5.7 KiB
TypeScript
import pLimit from 'p-limit';
|
|
import pc from 'picocolors';
|
|
|
|
import { createProgressBar, updateProgress, displayResults, displayError } from './display.js';
|
|
import { basicTestCases, generateTestCases } from '../chains/test-case-generator.js';
|
|
import {
|
|
setupTestEnvironment,
|
|
createAgent,
|
|
getConcurrencyLimit,
|
|
shouldGenerateTestCases,
|
|
howManyTestCasesToGenerate,
|
|
} from '../core/environment.js';
|
|
import { runSingleTest, initializeTestTracking } from '../core/test-runner.js';
|
|
import type { TestCase } from '../types/evaluation.js';
|
|
import type { TestResult } from '../types/test-result.js';
|
|
import {
|
|
calculateTestMetrics,
|
|
calculateCategoryAverages,
|
|
countViolationsByType,
|
|
} from '../utils/evaluation-calculator.js';
|
|
import { formatHeader, saveEvaluationResults } from '../utils/evaluation-helpers.js';
|
|
import { generateMarkdownReport } from '../utils/evaluation-reporter.js';
|
|
|
|
/** Options accepted by `runCliEvaluation`; every field is optional. */
type CliEvaluationOptions = {
	/** Number of times to run each test (e.g. for cache warming analysis). */
	repetitions?: number;
	/** Optional array of test cases to run (if not provided, uses defaults and generation). */
	testCases?: TestCase[];
	/** Optional test case ID to run only a specific test. */
	testCaseFilter?: string;
};
|
|
|
|
/**
 * Main CLI evaluation runner that executes all test cases in parallel.
 * Supports concurrency control via EVALUATION_CONCURRENCY environment variable
 * (the limit itself is obtained from `getConcurrencyLimit()`).
 *
 * Flow:
 *  1. Set up the test environment.
 *  2. Pick the test cases: the ones passed in `options.testCases`, otherwise the
 *     basic set, optionally extended with generated cases.
 *  3. Run every test case once per repetition behind a `p-limit` limiter.
 *  4. Display, aggregate, and save the results of the LAST repetition.
 *
 * Any error thrown along the way is reported through `displayError`; the
 * returned promise does not reject for errors raised inside the `try` block.
 *
 * @param options - Optional filter, explicit test cases, and repetition count.
 *   `repetitions` defaults to 1 and must be at least 1.
 * @returns A promise that resolves once results are saved or an error has been displayed.
 */
export async function runCliEvaluation(options: CliEvaluationOptions = {}): Promise<void> {
	const { repetitions = 1, testCaseFilter } = options;

	console.log(formatHeader('AI Workflow Builder Full Evaluation', 70));
	if (repetitions > 1) {
		console.log(pc.yellow(`➔ Each test will be run ${repetitions} times for cache analysis`));
	}
	console.log();

	try {
		// A count below 1 (or NaN) would skip the repetition loop entirely and leave
		// no results to display, so fail up front before any setup work is done.
		// Written as a negated >= so that NaN is rejected too.
		if (!(repetitions >= 1)) {
			throw new Error(`repetitions must be at least 1, received ${repetitions}`);
		}

		// Setup test environment
		const { parsedNodeTypes, llm, tracer } = await setupTestEnvironment();

		// Determine test cases to run; an empty array counts as "not provided"
		const providedTestCases =
			options.testCases && options.testCases.length > 0 ? options.testCases : undefined;

		let testCases: TestCase[] = providedTestCases ?? basicTestCases;

		if (providedTestCases) {
			console.log(pc.blue(`➔ Loaded ${providedTestCases.length} test cases from CSV`));
		}

		if (testCaseFilter) {
			// Filter to single test case if specified
			const filteredCase = testCases.find((tc) => tc.id === testCaseFilter);
			if (!filteredCase) {
				console.log(pc.red(`❌ Test case '${testCaseFilter}' not found`));
				console.log(pc.dim(`Available test cases: ${testCases.map((tc) => tc.id).join(', ')}`));
				return;
			}
			testCases = [filteredCase];
			console.log(pc.blue(`➔ Running single test case: ${filteredCase.name}`));
		} else if (!providedTestCases && shouldGenerateTestCases()) {
			// Optionally generate additional test cases (only when using the default set)
			console.log(pc.blue('➔ Generating additional test cases...'));
			const generatedCases = await generateTestCases(llm, howManyTestCasesToGenerate());
			testCases = [...testCases, ...generatedCases];
		}

		// Get concurrency from environment
		const concurrency = getConcurrencyLimit();
		console.log(pc.dim(`Running ${testCases.length} test cases with concurrency=${concurrency}`));
		console.log();

		const startTime = Date.now();
		const allRepetitionResults: TestResult[][] = [];

		// Run tests for each repetition
		for (let rep = 0; rep < repetitions; rep++) {
			if (repetitions > 1) {
				console.log(pc.cyan(`\n═══ Repetition ${rep + 1}/${repetitions} ═══\n`));
			}

			// Create progress bar for this repetition
			const progressBar = createProgressBar(testCases.length);

			// Create concurrency limiter
			const limit = pLimit(concurrency);

			// Track progress
			let completed = 0;
			const testResults = initializeTestTracking(testCases);

			// Run all test cases in parallel with concurrency limit
			const promises = testCases.map((testCase) =>
				limit(async () => {
					updateProgress(progressBar, completed, testCases.length, `Running: ${testCase.name}`);

					// Create a dedicated agent for this test to avoid state conflicts
					const testAgent = createAgent(parsedNodeTypes, llm, tracer);
					const result = await runSingleTest(testAgent, llm, testCase, parsedNodeTypes);

					testResults[testCase.id] = result.error ? 'fail' : 'pass';
					completed++;
					updateProgress(progressBar, completed, testCases.length);
					return result;
				}),
			);

			let results: TestResult[];
			try {
				results = await Promise.all(promises);
			} finally {
				// Always stop the bar, even when a test rejects, so the error output
				// from the outer catch is not printed underneath a still-active bar.
				progressBar.stop();
			}
			allRepetitionResults.push(results);

			// Show brief stats for this repetition if running multiple times
			if (repetitions > 1) {
				const repStats = results.map((r) => r.cacheStats).filter((s) => s !== undefined);
				if (repStats.length > 0) {
					const avgHitRate = repStats.reduce((sum, s) => sum + s.cacheHitRate, 0) / repStats.length;
					console.log(
						pc.dim(`\n Repetition ${rep + 1} cache hit rate: ${(avgHitRate * 100).toFixed(1)}%`),
					);
				}
			}
		}

		const totalTime = Date.now() - startTime;

		// Use last repetition results for display (most representative).
		// The repetitions check above guarantees the loop ran at least once.
		const results = allRepetitionResults[allRepetitionResults.length - 1];

		// Display results
		displayResults(testCases, results, totalTime);

		// Calculate metrics for report
		const metrics = calculateTestMetrics(results);
		const categoryAverages = calculateCategoryAverages(results);
		const violationCounts = countViolationsByType(results);

		const combinedMetrics = {
			...metrics,
			categoryAverages,
			violationCounts,
		};

		// Generate and save results
		const report = generateMarkdownReport(results, combinedMetrics);
		const { reportPath, resultsPath } = saveEvaluationResults(results, report);

		console.log(`\nReport saved to: ${reportPath}`);
		console.log(`Detailed results saved to: ${resultsPath}`);
	} catch (error) {
		displayError('Evaluation failed', error);
	}
}
|