n8n/packages/@n8n/instance-ai/evaluations/cli/index.ts
Luca Mattiazzi 714981eea3
feat: Add multiple runs to instanceAI eval (no-changelog) (#28493)
Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
2026-04-22 07:11:10 +00:00

230 lines
7.5 KiB
JavaScript

#!/usr/bin/env node
import { mkdirSync, writeFileSync } from 'fs';
import { join } from 'path';
import { aggregateResults } from './aggregator';
import { parseCliArgs } from './args';
import { N8nClient } from '../clients/n8n-client';
import { seedCredentials, cleanupCredentials } from '../credentials/seeder';
import { loadWorkflowTestCases } from '../data/workflows';
import { createLogger } from '../harness/logger';
import { runWorkflowTestCase, runWithConcurrency } from '../harness/runner';
import { snapshotWorkflowIds } from '../outcome/workflow-discovery';
import type { MultiRunEvaluation, WorkflowTestCaseResult } from '../types';
/**
 * CLI entry point for the workflow evaluation run.
 *
 * Parses the CLI arguments, loads the (optionally filtered) workflow test
 * cases, logs in to the n8n instance, seeds credentials, executes every test
 * case `args.runs` times with bounded concurrency, aggregates the per-run
 * results, writes `eval-results.json` and prints a console summary.
 *
 * Credential cleanup is attempted in a `finally` block, so it also happens
 * when a run throws. Returns early (without logging in) when no test cases
 * are found.
 */
async function main(): Promise<void> {
	const args = parseCliArgs(process.argv.slice(2));
	const testCases = loadWorkflowTestCases(args.filter);

	if (testCases.length === 0) {
		console.log('No workflow test cases found in evaluations/data/workflows/');
		return;
	}

	const totalScenarios = testCases.reduce((sum, tc) => sum + tc.scenarios.length, 0);
	console.log(
		`Running ${String(testCases.length)} workflow test case(s) with ${String(totalScenarios)} scenario(s) x ${String(args.runs)} runs\n`,
	);

	const logger = createLogger(args.verbose);

	// Setup: authenticate, seed credentials, snapshot workflows
	const client = new N8nClient(args.baseUrl);
	logger.info(`Authenticating with ${args.baseUrl}...`);
	await client.login(args.email, args.password);
	logger.success('Authenticated');

	logger.info('Seeding credentials...');
	const seedResult = await seedCredentials(client, undefined, logger);
	logger.info(`Seeded ${String(seedResult.credentialIds.length)} credential(s)`);

	// Run test cases with bounded concurrency.
	// Each test case builds a workflow (uses n8n's agent) then runs scenarios
	// (uses our Anthropic key for Phase 1 + Phase 2 mock generation).
	const MAX_CONCURRENT_TEST_CASES = 4;

	const startTime = Date.now();
	// One entry per run; each entry holds that run's result for every test case.
	const allRunResults: WorkflowTestCaseResult[][] = [];

	try {
		for (let run = 0; run < args.runs; run++) {
			if (args.runs > 1) {
				console.log(`\n--- Run #${String(run + 1)}/${String(args.runs)} ---\n`);
			}

			// Both the snapshot and the claimed-ID set are recreated for every run
			// and shared by all test cases executing within that run.
			const preRunWorkflowIds = await snapshotWorkflowIds(client);
			const claimedWorkflowIds = new Set<string>();

			const results = await runWithConcurrency(
				testCases,
				async (testCase) =>
					await runWorkflowTestCase({
						client,
						testCase,
						timeoutMs: args.timeoutMs,
						seededCredentialTypes: seedResult.seededTypes,
						preRunWorkflowIds,
						claimedWorkflowIds,
						logger,
						keepWorkflows: args.keepWorkflows,
					}),
				MAX_CONCURRENT_TEST_CASES,
			);

			allRunResults.push(results);
		}
	} finally {
		// Best-effort cleanup: a failure here must not mask the outcome of the
		// runs themselves, but it is reported because seeded credentials may
		// have been left behind on the instance.
		await cleanupCredentials(client, seedResult.credentialIds).catch((error: unknown) => {
			const reason = error instanceof Error ? error.message : String(error);
			console.warn(`Warning: failed to clean up seeded credentials: ${reason}`);
		});
	}

	const totalDuration = Date.now() - startTime;
	const aggregatedResults = aggregateResults(allRunResults, args.runs);

	// Write eval-results.json for CI consumption (PR comments, artifacts)
	const outputPath = writeEvalResults(aggregatedResults, totalDuration, args.outputDir);
	console.log(`Results: ${outputPath}`);

	// Print console summary
	printSummary(aggregatedResults);
}
/**
 * Serialise an aggregated evaluation to `eval-results.json` for CI
 * (PR comments, artifact upload).
 *
 * The report carries a timestamp, the supplied duration, the run count, a
 * summary block (test-case count, how many built at least once, scenario
 * count and the mean pass@k / pass^k taken at k = totalRuns) and a
 * per-test-case / per-scenario / per-run breakdown.
 *
 * @param evaluation Aggregated multi-run results.
 * @param duration Value stored verbatim in the report's `duration` field.
 * @param outputDir Target directory (created recursively if missing);
 *   defaults to the current working directory.
 * @returns Path of the written JSON file.
 */
function writeEvalResults(
	evaluation: MultiRunEvaluation,
	duration: number,
	outputDir?: string,
): string {
	const { totalRuns, testCases } = evaluation;
	// The metrics reported are the ones at k = totalRuns, i.e. the entry at this index.
	const finalK = totalRuns - 1;

	// Single pass over every scenario of every test case, in declaration order.
	let scenarioCount = 0;
	let passAtKSum = 0;
	let passHatKSum = 0;
	let builtCount = 0;
	for (const tc of testCases) {
		if (tc.buildSuccessCount > 0) {
			builtCount += 1;
		}
		for (const scenario of tc.scenarios) {
			scenarioCount += 1;
			passAtKSum += scenario.passAtK[finalK] ?? 0;
			passHatKSum += scenario.passHatK[finalK] ?? 0;
		}
	}

	// Mean over all scenarios; 0 when there are none (avoids dividing by zero).
	const meanOverScenarios = (sum: number): number =>
		scenarioCount > 0 ? sum / scenarioCount : 0;

	const perTestCase = testCases.map((tc) => {
		const scenarios = tc.scenarios.map((sa) => {
			const runs = sa.runs.map((sr) => ({
				passed: sr.success,
				score: sr.score,
				reasoning: sr.reasoning,
				failureCategory: sr.failureCategory,
				rootCause: sr.rootCause,
			}));
			return {
				name: sa.scenario.name,
				passCount: sa.passCount,
				totalRuns,
				passAtK: sa.passAtK[finalK] ?? 0,
				passHatK: sa.passHatK[finalK] ?? 0,
				runs,
			};
		});
		return {
			// Prompts can be long; the first 70 characters serve as the display name.
			name: tc.testCase.prompt.slice(0, 70),
			buildSuccessCount: tc.buildSuccessCount,
			totalRuns,
			scenarios,
		};
	});

	const report = {
		timestamp: new Date().toISOString(),
		duration,
		totalRuns,
		summary: {
			testCases: testCases.length,
			built: builtCount,
			scenariosTotal: scenarioCount,
			passAtK: meanOverScenarios(passAtKSum),
			passHatK: meanOverScenarios(passHatKSum),
		},
		testCases: perTestCase,
	};

	const targetDir = outputDir ?? process.cwd();
	mkdirSync(targetDir, { recursive: true });

	const outputPath = join(targetDir, 'eval-results.json');
	writeFileSync(outputPath, JSON.stringify(report, null, 2));
	return outputPath;
}
/**
 * Print a human-readable summary of an aggregated evaluation to stdout.
 *
 * With more than one run, each test case shows its build success count and
 * each scenario shows its pass count plus pass@k / pass^k at k = totalRuns,
 * followed by the mean of those metrics across all scenarios. With a single
 * run, the build status and each scenario's PASS/FAIL verdict are shown
 * instead. A totals line (built test cases and passed attempts) is always
 * printed last.
 *
 * Percentages are rounded to whole numbers in both modes, and a test case or
 * scenario without a recorded run is reported instead of throwing.
 *
 * @param evaluation Aggregated multi-run results to display.
 */
function printSummary(evaluation: MultiRunEvaluation): void {
	const { totalRuns, testCases } = evaluation;
	const multiRun = totalRuns > 1;
	// Metrics shown are the ones at k = totalRuns, i.e. the entry at this index.
	const finalK = totalRuns - 1;
	const allScenarios = testCases.flatMap((tc) => tc.scenarios);
	const total = allScenarios.length;
	// Rounding avoids float artefacts such as 0.29 * 100 === 28.999999999999996.
	const toPercent = (ratio: number): number => Math.round(ratio * 100);

	console.log('\n=== Workflow Eval Results ===\n');

	for (const tc of testCases) {
		console.log(`${tc.testCase.prompt.slice(0, 70)}...`);

		if (multiRun) {
			console.log(` Build: ${String(tc.buildSuccessCount)}/${String(totalRuns)} runs`);
		} else {
			const r = tc.runs[0];
			if (r === undefined) {
				// Nothing recorded for this test case; report it rather than crash.
				console.log(' Workflow: NO RESULT');
			} else {
				const buildStatus = r.workflowBuildSuccess ? 'BUILT' : 'BUILD FAILED';
				console.log(` Workflow: ${buildStatus}${r.workflowId ? ` (${r.workflowId})` : ''}`);
				if (r.buildError) {
					console.log(` Error: ${r.buildError.slice(0, 200)}`);
				}
			}
		}

		for (const sa of tc.scenarios) {
			if (multiRun) {
				const passAtK = toPercent(sa.passAtK[finalK] ?? 0);
				const passHatK = toPercent(sa.passHatK[finalK] ?? 0);
				console.log(
					` ${sa.scenario.name}: ${String(sa.passCount)}/${String(totalRuns)} passed` +
						` | pass@${String(totalRuns)}: ${String(passAtK)}% | pass^${String(totalRuns)}: ${String(passHatK)}%`,
				);
				continue;
			}

			const sr = sa.runs[0];
			if (sr === undefined) {
				// The totals line below already counts a missing run as not passed.
				console.log(` - ${sa.scenario.name}: NOT RUN`);
				continue;
			}

			const icon = sr.success ? '✓' : '✗';
			console.log(
				` ${icon} ${sr.scenario.name}: ${sr.success ? 'PASS' : 'FAIL'} (${String(toPercent(sr.score))}%)`,
			);
			if (!sr.success) {
				console.log(` ${sr.reasoning.slice(0, 120)}`);
			}
		}
	}

	// Aggregate metrics for multi-run
	if (multiRun) {
		const meanPercent = (pick: (s: (typeof allScenarios)[number]) => number): number =>
			total > 0 ? toPercent(allScenarios.reduce((sum, s) => sum + pick(s), 0) / total) : 0;
		const avgPassAtK = meanPercent((s) => s.passAtK[finalK] ?? 0);
		const avgPassHatK = meanPercent((s) => s.passHatK[finalK] ?? 0);

		console.log('\n=== Aggregate Metrics ===\n');
		console.log(` pass@${String(totalRuns)}: ${String(avgPassAtK)}%`);
		console.log(` pass^${String(totalRuns)}: ${String(avgPassHatK)}%`);
	}

	// Totals
	const built = testCases.filter((tc) => tc.buildSuccessCount > 0).length;
	const passedTotal = multiRun
		? allScenarios.reduce((sum, s) => sum + s.passCount, 0)
		: allScenarios.filter((s) => s.runs[0]?.success).length;
	// Every scenario is attempted once per run.
	const totalAttempts = multiRun ? total * totalRuns : total;
	const passRate = totalAttempts > 0 ? toPercent(passedTotal / totalAttempts) : 0;
	console.log(
		`\n${String(built)}/${String(testCases.length)} built | ${String(passedTotal)}/${String(totalAttempts)} passed (${String(passRate)}%)`,
	);
}
/** Last-resort handler: report whatever `main` rejected with, then exit with status 1. */
function exitOnFatalError(reason: unknown): void {
	console.error('Fatal error:', reason);
	process.exit(1);
}

// Script entry: start the evaluation and route any rejection to the handler above.
main().catch(exitOnFatalError);