n8n/packages/@n8n/instance-ai/evaluations/cli/index.ts
José Braulio González Valido fef91c97dd
Some checks are pending
Build: Benchmark Image / build (push) Waiting to run
CI: Master (Build, Test, Lint) / Build for Github Cache (push) Waiting to run
CI: Master (Build, Test, Lint) / Unit tests (22.x) (push) Waiting to run
CI: Master (Build, Test, Lint) / Unit tests (24.14.1) (push) Waiting to run
CI: Master (Build, Test, Lint) / Unit tests (25.x) (push) Waiting to run
CI: Master (Build, Test, Lint) / Lint (push) Waiting to run
CI: Master (Build, Test, Lint) / Performance (push) Waiting to run
CI: Master (Build, Test, Lint) / Notify Slack on failure (push) Blocked by required conditions
Util: Sync API Docs / sync-public-api (push) Waiting to run
feat(ai-builder): Add --keep-workflows flag and fix eval execution errors (no-changelog) (#28129)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 17:35:04 +00:00

98 lines
3.3 KiB
JavaScript

#!/usr/bin/env node
import { parseCliArgs } from './args';
import { N8nClient } from '../clients/n8n-client';
import { seedCredentials, cleanupCredentials } from '../credentials/seeder';
import { loadWorkflowTestCases } from '../data/workflows';
import { createLogger } from '../harness/logger';
import { runWorkflowTestCase, runWithConcurrency } from '../harness/runner';
import { snapshotWorkflowIds } from '../outcome/workflow-discovery';
import { writeWorkflowReport } from '../report/workflow-report';
/**
 * Formats a score as a percentage number without floating-point noise.
 * The value is rounded to one decimal place and trailing zeros are dropped
 * (e.g. 0.57 -> "57", 0.333 -> "33.3").
 */
function formatScorePercent(score: number): string {
	return String(Number((score * 100).toFixed(1)));
}

/** Shortens `text` to `maxLength` characters, appending "..." only when something was cut. */
function truncateWithEllipsis(text: string, maxLength: number): string {
	return text.length > maxLength ? `${text.slice(0, maxLength)}...` : text;
}

/**
 * CLI entry point for the workflow evaluations.
 *
 * Loads the workflow test cases selected by the CLI arguments, logs in to the
 * n8n instance, seeds credentials, runs every test case with bounded
 * concurrency, writes the report and prints a per-scenario summary.
 *
 * Seeded credentials are removed again once seeding has succeeded, regardless
 * of whether the workflow snapshot or the test run fails afterwards.
 *
 * @returns A promise that resolves once the report and summary were written.
 *   It rejects if authentication, seeding, the snapshot or the run itself throws.
 */
async function main(): Promise<void> {
	const args = parseCliArgs(process.argv.slice(2));

	const testCases = loadWorkflowTestCases(args.filter);
	if (testCases.length === 0) {
		console.log('No workflow test cases found in evaluations/data/workflows/');
		return;
	}

	const totalScenarios = testCases.reduce((sum, tc) => sum + tc.scenarios.length, 0);
	console.log(
		`Running ${String(testCases.length)} workflow test case(s) with ${String(totalScenarios)} scenario(s)\n`,
	);

	const logger = createLogger(args.verbose);

	// Setup: authenticate, seed credentials, snapshot workflows
	const client = new N8nClient(args.baseUrl);
	logger.info(`Authenticating with ${args.baseUrl}...`);
	await client.login(args.email, args.password);
	logger.success('Authenticated');

	logger.info('Seeding credentials...');
	const seedResult = await seedCredentials(client);
	logger.info(`Seeded ${String(seedResult.credentialIds.length)} credential(s)`);

	// Run test cases with bounded concurrency.
	// Each test case builds a workflow (uses n8n's agent) then runs scenarios
	// (uses our Anthropic key for Phase 1 + Phase 2 mock generation).
	// At Tier 4 (20K RPM) no practical limit is needed — set high to run all in parallel.
	const MAX_CONCURRENT_TEST_CASES = 4;

	let results;
	try {
		// The snapshot is taken inside the try block so that a failure here
		// still triggers the credential cleanup below.
		const preRunWorkflowIds = await snapshotWorkflowIds(client);
		const claimedWorkflowIds = new Set<string>();

		results = await runWithConcurrency(
			testCases,
			async (testCase) =>
				await runWorkflowTestCase({
					client,
					testCase,
					timeoutMs: args.timeoutMs,
					seededCredentialTypes: seedResult.seededTypes,
					preRunWorkflowIds,
					claimedWorkflowIds,
					logger,
					keepWorkflows: args.keepWorkflows,
				}),
			MAX_CONCURRENT_TEST_CASES,
		);
	} finally {
		// Cleanup credentials even if test execution fails. Cleanup stays
		// best-effort (it must not mask the original error), but a failure is
		// reported so leftover credentials do not go unnoticed.
		try {
			await cleanupCredentials(client, seedResult.credentialIds);
		} catch (cleanupError) {
			console.warn('Warning: failed to clean up seeded credentials:', cleanupError);
		}
	}

	// Generate HTML report
	const reportPath = writeWorkflowReport(results);
	console.log(`Report: ${reportPath}`);

	// Print summary
	console.log('\n=== Workflow Test Case Results ===\n');
	for (const r of results) {
		const buildStatus = r.workflowBuildSuccess ? 'BUILT' : 'BUILD FAILED';
		console.log(truncateWithEllipsis(r.testCase.prompt, 70));
		console.log(` Workflow: ${buildStatus}${r.workflowId ? ` (${r.workflowId})` : ''}`);
		if (r.buildError) {
			console.log(` Error: ${r.buildError.slice(0, 200)}`);
		}

		for (const sr of r.scenarioResults) {
			const icon = sr.success ? '\u2713' : '\u2717';
			console.log(
				` ${icon} ${sr.scenario.name}: ${sr.success ? 'PASS' : 'FAIL'} (${formatScorePercent(sr.score)}%)`,
			);
			if (!sr.success) {
				console.log(` ${sr.reasoning.slice(0, 120)}`);
			}
		}
		console.log('');
	}
}
/**
 * Last-resort handler for a rejected `main()`: prints the error to stderr
 * and terminates the process with exit code 1.
 */
function reportFatalAndExit(error: unknown): never {
	console.error('Fatal error:', error);
	process.exit(1);
}

// Start the CLI; any rejection is routed to the fatal-error handler.
main().catch(reportFatalAndExit);