n8n/packages/@n8n/instance-ai/evaluations/discovery/cli.ts

270 lines
8.2 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
// ---------------------------------------------------------------------------
// CLI for browser/computer-use discovery evaluation.
//
// Usage:
// pnpm eval:discovery # run all scenarios, 3 trials each
// pnpm eval:discovery --filter slack-oauth --verbose
// pnpm eval:discovery --trials 5
//
// Loads scenarios from evaluations/data/discovery/, runs each scenario × N
// trials via the in-process runner, reports per-scenario pass-rates, exits
// non-zero on any scenario below threshold.
// ---------------------------------------------------------------------------
import { runDiscoveryScenario, type DiscoveryRunResult } from './runner';
import type { DiscoveryTestCase } from './types';
import { loadDiscoveryTestCasesWithFiles } from '../data/discovery';
// ---------------------------------------------------------------------------
// Args
// ---------------------------------------------------------------------------
interface CliArgs {
filter?: string;
verbose: boolean;
trials: number;
passThreshold: number;
timeoutMs: number;
maxSteps: number;
modelId: string;
concurrency: number;
nodesJsonPath?: string;
}
const DEFAULT_MODEL = process.env.N8N_INSTANCE_AI_EVAL_MODEL ?? 'anthropic/claude-sonnet-4-6';
function previewArgs(args: Record<string, unknown>): string {
const entries = Object.entries(args).slice(0, 3);
if (entries.length === 0) return '';
return entries
.map(([k, v]) => {
let preview: string;
if (typeof v === 'string') {
preview = v.length > 40 ? `"${v.slice(0, 40)}…"` : `"${v}"`;
} else if (typeof v === 'number' || typeof v === 'boolean' || v === null) {
preview = String(v);
} else if (Array.isArray(v)) {
preview = `[${String(v.length)}]`;
} else {
preview = '{…}';
}
return `${k}=${preview}`;
})
.join(', ');
}
function requirePositiveInt(raw: string | undefined, flag: string): number {
const n = Number(raw);
if (!Number.isFinite(n) || n < 1 || !Number.isInteger(n)) {
console.error(`Invalid value for ${flag}: ${String(raw)} (expected a positive integer)`);
process.exit(1);
}
return n;
}
function requireNumberInRange(
raw: string | undefined,
flag: string,
lo: number,
hi: number,
): number {
const n = Number(raw);
if (!Number.isFinite(n) || n < lo || n > hi) {
console.error(`Invalid value for ${flag}: ${String(raw)} (expected number in [${lo}, ${hi}])`);
process.exit(1);
}
return n;
}
function parseArgs(argv: string[]): CliArgs {
const args: CliArgs = {
verbose: false,
trials: 3,
passThreshold: 2 / 3,
timeoutMs: 60_000,
maxSteps: 5,
modelId: DEFAULT_MODEL,
concurrency: 3,
};
for (let i = 0; i < argv.length; i++) {
const arg = argv[i];
switch (arg) {
case '--verbose':
case '-v':
args.verbose = true;
break;
case '--filter':
args.filter = argv[++i];
break;
case '--trials':
args.trials = requirePositiveInt(argv[++i], '--trials');
break;
case '--pass-threshold':
args.passThreshold = requireNumberInRange(argv[++i], '--pass-threshold', 0, 1);
break;
case '--timeout':
args.timeoutMs = requirePositiveInt(argv[++i], '--timeout');
break;
case '--max-steps':
args.maxSteps = requirePositiveInt(argv[++i], '--max-steps');
break;
case '--model':
args.modelId = argv[++i];
break;
case '--concurrency':
args.concurrency = requirePositiveInt(argv[++i], '--concurrency');
break;
case '--nodes-json':
args.nodesJsonPath = argv[++i];
break;
default:
break;
}
}
return args;
}
// ---------------------------------------------------------------------------
// Local mode
// ---------------------------------------------------------------------------
interface ScenarioAggregate {
scenario: DiscoveryTestCase;
results: DiscoveryRunResult[];
passCount: number;
passRate: number;
}
async function runLocalMode(args: CliArgs): Promise<void> {
const cases = loadDiscoveryTestCasesWithFiles(args.filter);
if (cases.length === 0) {
console.log('No discovery scenarios found.');
return;
}
console.log(
`Running ${String(cases.length)} discovery scenario(s) × ${String(args.trials)} trial(s) (model: ${args.modelId}, concurrency: ${String(args.concurrency)}).\n`,
);
const aggregates: ScenarioAggregate[] = [];
// Per scenario, run trials in parallel up to `concurrency`. Across scenarios, run sequentially
// — keeps the load profile predictable and per-scenario reports atomic.
for (const { testCase, fileSlug } of cases) {
process.stdout.write(`${fileSlug} ... `);
const trialResults: DiscoveryRunResult[] = [];
for (let i = 0; i < args.trials; i += args.concurrency) {
const batchSize = Math.min(args.concurrency, args.trials - i);
const batch = await Promise.all(
Array.from(
{ length: batchSize },
async () =>
await runDiscoveryScenario({
scenario: testCase,
modelId: args.modelId,
maxSteps: args.maxSteps,
timeoutMs: args.timeoutMs,
...(args.nodesJsonPath ? { nodesJsonPath: args.nodesJsonPath } : {}),
}),
),
);
trialResults.push(...batch);
}
const passCount = trialResults.filter((r) => r.check.pass).length;
const passRate = passCount / trialResults.length;
const status = passRate >= args.passThreshold ? '✓' : '✗';
console.log(
`${status} ${String(passCount)}/${String(trialResults.length)} passed (${(passRate * 100).toFixed(0)}%)`,
);
if (args.verbose) {
for (const [idx, r] of trialResults.entries()) {
const icon = r.check.pass ? ' ✓' : ' ✗';
const trial = `trial ${String(idx + 1)}`;
console.log(`${icon} ${trial} (${(r.durationMs / 1000).toFixed(1)}s) — ${r.check.comment}`);
if (r.runError) console.log(` run error: ${r.runError}`);
if (r.outcome.toolCalls.length > 0) {
console.log(' tool calls:');
for (const tc of r.outcome.toolCalls) {
const argsPreview = previewArgs(tc.args);
console.log(`${tc.toolName}(${argsPreview})`);
}
}
if (r.outcome.agentActivities.length > 0) {
console.log(' spawned sub-agents:');
for (const a of r.outcome.agentActivities) {
console.log(` • role=${a.role}, tools=[${a.tools.join(', ')}]`);
}
}
}
}
aggregates.push({ scenario: testCase, results: trialResults, passCount, passRate });
}
printSummary(aggregates, args);
const failingScenarios = aggregates.filter((a) => a.passRate < args.passThreshold);
if (failingScenarios.length > 0) {
process.exitCode = 1;
}
}
function printSummary(aggregates: ScenarioAggregate[], args: CliArgs): void {
console.log('\n=== Summary ===');
const totalTrials = aggregates.reduce((sum, a) => sum + a.results.length, 0);
const totalPasses = aggregates.reduce((sum, a) => sum + a.passCount, 0);
const passingScenarios = aggregates.filter((a) => a.passRate >= args.passThreshold).length;
const totalDurationMs = aggregates
.flatMap((a) => a.results)
.reduce((sum, r) => sum + r.durationMs, 0);
console.log(
`Scenarios: ${String(passingScenarios)}/${String(aggregates.length)} above threshold (${(args.passThreshold * 100).toFixed(0)}%)`,
);
console.log(
`Trials: ${String(totalPasses)}/${String(totalTrials)} passed (${((totalPasses / totalTrials) * 100).toFixed(0)}%)`,
);
console.log(`Total time: ${(totalDurationMs / 1000).toFixed(1)}s`);
const failingScenarios = aggregates.filter((a) => a.passRate < args.passThreshold);
if (failingScenarios.length > 0) {
console.log('\nFailing scenarios:');
for (const a of failingScenarios) {
console.log(`${a.scenario.id} (${(a.passRate * 100).toFixed(0)}%)`);
const firstFailure = a.results.find((r) => !r.check.pass);
if (firstFailure) {
console.log(` ${firstFailure.check.comment}`);
}
}
}
}
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------
async function main(): Promise<void> {
const args = parseArgs(process.argv.slice(2));
if (!process.env.ANTHROPIC_API_KEY) {
console.error(
'Error: ANTHROPIC_API_KEY is required to run discovery evaluations (the runner calls Anthropic in-process).',
);
process.exit(1);
}
await runLocalMode(args);
}
main().catch((error) => {
console.error('Fatal error:', error);
process.exit(1);
});