mirror of
https://github.com/n8n-io/n8n.git
synced 2026-05-27 14:57:21 +02:00
373 lines
11 KiB
JavaScript
373 lines
11 KiB
JavaScript
#!/usr/bin/env node
|
|
// ---------------------------------------------------------------------------
|
|
// CLI for isolated sub-agent evaluation
|
|
//
|
|
// Usage:
|
|
// pnpm eval:subagent --verbose
|
|
// pnpm eval:subagent --filter webhook --verbose
|
|
// pnpm eval:subagent --prompt "Build a webhook workflow" --subagent builder
|
|
// pnpm eval:subagent --dataset my-dataset --experiment my-exp --verbose
|
|
// ---------------------------------------------------------------------------
|
|
|
|
import { Client } from 'langsmith/client';
|
|
import { evaluate } from 'langsmith/evaluation';
|
|
import { readFileSync, readdirSync } from 'node:fs';
|
|
import { join, basename } from 'node:path';
|
|
|
|
import { createFeedbackExtractor, mapExampleToTestCase } from './langsmith';
|
|
import { runSubAgent, type RunSubAgentDeps } from './runner';
|
|
import type { SubAgentTestCase, SubAgentRunnerConfig, SubAgentResult } from './types';
|
|
import { N8nClient } from '../clients/n8n-client';
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Args
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Parsed command-line options for the sub-agent evaluation CLI.
 *
 * Fields marked optional are only set when the corresponding flag is passed
 * (or, for `modelId`, when the fallback environment variable is defined).
 */
interface CliArgs {
	// --- Test-case selection -------------------------------------------------

	/** `--dataset`: LangSmith dataset name; when set, the CLI runs in LangSmith mode. */
	dataset?: string;

	/** `--experiment`: passed to LangSmith as the experiment prefix. */
	experiment?: string;

	/** `--prompt`: when set, local mode runs this single ad-hoc prompt instead of the local JSON files. */
	prompt?: string;

	/** `--filter`: substring that local test-case file names must contain. */
	filter?: string;

	/** `--subagent`: sub-agent applied to test cases that do not name one (default `'builder'`). */
	subagent: string;

	// --- Runner configuration ------------------------------------------------

	/** `--model`: model id; defaults to the `N8N_INSTANCE_AI_EVAL_MODEL` environment variable. */
	modelId?: string;

	/** `--timeout`: timeout forwarded to the runner config (default `120_000`). */
	timeoutMs: number;

	/** `--max-steps`: step limit forwarded to the runner config (default `40`). */
	maxSteps: number;

	/** `--concurrency`: batch size in local mode and `maxConcurrency` in LangSmith mode (default `5`). */
	concurrency: number;

	// --- Environment and output ----------------------------------------------

	/** `--base-url`: n8n instance URL; defaults to `N8N_EVAL_BASE_URL` or `http://localhost:5678`. */
	baseUrl: string;

	/** `--keep-workflows`: when true, the runner is told not to delete workflows after each run. */
	keepWorkflows: boolean;

	/** `--verbose` / `-v`: print per-check details, the captured workflow and the agent text. */
	verbose: boolean;
}
|
|
|
|
/**
 * Converts a raw flag value to a positive integer.
 *
 * Prints an error naming the flag and exits the process with code 1 when the
 * value is missing, not numeric, fractional, or smaller than 1.
 *
 * @param raw - The raw string that followed the flag (undefined if absent).
 * @param flag - The flag name, used only in the error message.
 * @returns The parsed integer.
 */
function requirePositiveInt(raw: string | undefined, flag: string): number {
	const value = Number(raw);
	// Number.isInteger is false for NaN and ±Infinity, so no separate finiteness check is needed.
	const isPositiveInt = Number.isInteger(value) && value >= 1;

	if (!isPositiveInt) {
		console.error(`Invalid value for ${flag}: ${String(raw)} (expected a positive integer)`);
		process.exit(1);
	}

	return value;
}
|
|
|
|
/**
 * Parses the CLI arguments (without the `node` and script entries) into {@link CliArgs}.
 *
 * Defaults: 120s timeout, 40 max steps, `builder` sub-agent, concurrency 5,
 * model from `N8N_INSTANCE_AI_EVAL_MODEL`, base URL from `N8N_EVAL_BASE_URL`
 * or `http://localhost:5678`.
 *
 * A flag that expects a value but is the last argument makes the process exit
 * with code 1 instead of silently storing `undefined`. Unrecognized arguments
 * are ignored.
 *
 * @param argv - Arguments to parse, typically `process.argv.slice(2)`.
 * @returns The populated options object.
 */
function parseArgs(argv: string[]): CliArgs {
	const args: CliArgs = {
		verbose: false,
		timeoutMs: 120_000,
		maxSteps: 40,
		modelId: process.env.N8N_INSTANCE_AI_EVAL_MODEL,
		subagent: 'builder',
		concurrency: 5,
		baseUrl: process.env.N8N_EVAL_BASE_URL ?? 'http://localhost:5678',
		keepWorkflows: false,
	};

	// Returns the value at `index`, or exits when the flag was given without one.
	const valueAt = (index: number, flag: string): string => {
		const value = argv[index];
		if (value === undefined) {
			console.error(`Missing value for ${flag}`);
			process.exit(1);
		}
		return value;
	};

	for (let i = 0; i < argv.length; i++) {
		const arg = argv[i];
		switch (arg) {
			case '--verbose':
			case '-v':
				args.verbose = true;
				break;
			case '--filter':
				args.filter = valueAt(++i, '--filter');
				break;
			case '--timeout':
				args.timeoutMs = requirePositiveInt(argv[++i], '--timeout');
				break;
			case '--max-steps':
				args.maxSteps = requirePositiveInt(argv[++i], '--max-steps');
				break;
			case '--model':
				args.modelId = valueAt(++i, '--model');
				break;
			case '--subagent':
				args.subagent = valueAt(++i, '--subagent');
				break;
			case '--prompt':
				args.prompt = valueAt(++i, '--prompt');
				break;
			case '--dataset':
				args.dataset = valueAt(++i, '--dataset');
				break;
			case '--experiment':
				args.experiment = valueAt(++i, '--experiment');
				break;
			case '--concurrency':
				args.concurrency = requirePositiveInt(argv[++i], '--concurrency');
				break;
			case '--base-url':
				args.baseUrl = valueAt(++i, '--base-url');
				break;
			case '--keep-workflows':
				args.keepWorkflows = true;
				break;
			default:
				// Unknown arguments are deliberately ignored.
				break;
		}
	}

	return args;
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Dataset loading (local JSON files)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Directory scanned for local test-case `.json` files, resolved relative to this script's directory.
const DATA_DIR = join(__dirname, '..', 'data', 'subagent');
|
|
|
|
/**
 * Loads sub-agent test cases from the `.json` files in `DATA_DIR`.
 *
 * Files that cannot be read, are not valid JSON, are not a JSON object, or
 * lack a string `prompt` are reported on stderr and skipped, so one bad file
 * does not abort the run or yield a test case without a prompt.
 *
 * @param filter - When given, only files whose name contains this substring are loaded.
 * @param subagent - Sub-agent assigned to test cases that do not declare their own.
 * @returns The loaded test cases; empty when the directory cannot be listed.
 */
function loadLocalTestCases(filter?: string, subagent?: string): SubAgentTestCase[] {
	let files: string[];
	try {
		files = readdirSync(DATA_DIR).filter((f) => f.endsWith('.json'));
	} catch {
		console.error(`No test cases found in ${DATA_DIR}`);
		return [];
	}

	if (filter) {
		files = files.filter((f) => f.includes(filter));
	}

	const cases: SubAgentTestCase[] = [];
	for (const file of files) {
		let raw: string;
		try {
			raw = readFileSync(join(DATA_DIR, file), 'utf-8');
		} catch {
			console.error(`Failed to read ${file}`);
			continue;
		}

		let parsed: unknown;
		try {
			parsed = JSON.parse(raw);
		} catch {
			console.error(`Failed to parse ${file}`);
			continue;
		}

		// JSON.parse may legally return null, a primitive or an array.
		if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
			console.error(`Skipping ${file}: expected a JSON object`);
			continue;
		}

		const entry = parsed as {
			id?: string;
			prompt?: unknown;
			subagent?: string;
			systemPrompt?: string;
			tools?: string[];
			maxSteps?: number;
			annotations?: Record<string, unknown>;
		};

		if (typeof entry.prompt !== 'string') {
			console.error(`Skipping ${file}: missing "prompt" string`);
			continue;
		}

		const tc: SubAgentTestCase = {
			// Fall back to the file name (without extension) as the test-case id.
			id: entry.id ?? basename(file, '.json'),
			prompt: entry.prompt,
		};
		const resolvedSubagent = entry.subagent ?? subagent;
		if (resolvedSubagent) tc.subagent = resolvedSubagent;
		if (entry.systemPrompt) tc.systemPrompt = entry.systemPrompt;
		if (entry.tools) tc.tools = entry.tools;
		if (entry.maxSteps) tc.maxSteps = entry.maxSteps;
		if (entry.annotations) tc.annotations = entry.annotations;
		cases.push(tc);
	}
	return cases;
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Display helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Shortens `text` to at most `maxLen` UTF-16 code units and appends `...`
 * when anything was cut off.
 *
 * If the cut would fall between the two halves of a surrogate pair (e.g. an
 * emoji), the dangling high surrogate is dropped as well so the output never
 * contains a broken character.
 *
 * @param text - The string to shorten.
 * @param maxLen - Maximum number of code units kept before the ellipsis.
 * @returns `text` unchanged when it fits, otherwise the shortened text plus `...`.
 */
function truncate(text: string, maxLen: number): string {
	if (text.length <= maxLen) return text;

	let end = maxLen;
	if (end > 0) {
		const lastKept = text.charCodeAt(end - 1);
		// 0xD800–0xDBFF is the high-surrogate range; its partner would be cut off.
		if (lastKept >= 0xd800 && lastKept <= 0xdbff) end -= 1;
	}
	return text.slice(0, end) + '...';
}
|
|
|
|
/**
 * Prints a one-line status for a finished test case, followed by the error
 * (if any) and, in verbose mode, the individual binary checks, the first
 * captured workflow and the beginning of the agent's text.
 *
 * @param result - The outcome of a single sub-agent run.
 * @param verbose - Whether to print the detailed section.
 */
function printResult(result: SubAgentResult, verbose: boolean): void {
	const { feedback } = result;

	// Headline figures.
	const seconds = (result.durationMs / 1000).toFixed(1);
	const passRateEntry = feedback.find(
		(entry) => entry.evaluator === 'binary-checks' && entry.metric === 'pass_rate',
	);
	const producedEntry = feedback.find((entry) => entry.metric === 'workflow_produced');

	let statusIcon: string;
	if (result.error) {
		statusIcon = '\u2717';
	} else if (producedEntry?.score === 1) {
		statusIcon = '\u2713';
	} else {
		statusIcon = '\u25CB';
	}

	let passRateStr = 'N/A';
	if (passRateEntry) {
		passRateStr = `${(passRateEntry.score * 100).toFixed(0)}%`;
	}

	console.log(
		`${statusIcon} ${result.testCase.id} (${seconds}s) — ${String(result.capturedWorkflows.length)} workflow(s), binary checks: ${passRateStr}`,
	);

	if (result.error) {
		console.log(` Error: ${truncate(result.error, 200)}`);
	}

	if (!verbose) return;

	// Detailed section: one line per binary check.
	feedback
		.filter((entry) => entry.evaluator === 'binary-checks' && entry.kind === 'metric')
		.forEach((check) => {
			const icon = check.score === 1 ? ' \u2713' : ' \u2717';
			const comment = check.comment ? ` — ${check.comment}` : '';
			console.log(`${icon} ${check.metric}${comment}`);
		});

	if (result.capturedWorkflows.length > 0) {
		console.log(' Workflow: ', result.capturedWorkflows[0].json);
	}

	if (result.text) {
		console.log(` Agent: ${truncate(result.text, 300)}`);
	}

	console.log('');
}
|
|
|
|
/**
 * Prints the aggregate summary for a run: total count, how many test cases
 * produced a workflow without error, how many did not, and the mean duration.
 *
 * With an empty result list the average is reported as `N/A` rather than
 * dividing by zero and printing `NaNs`.
 *
 * @param results - All results collected during the run.
 */
function printSummary(results: SubAgentResult[]): void {
	// A case counts as passed when it finished without error and captured at least one workflow.
	const passed = results.filter((r) => !r.error && r.capturedWorkflows.length > 0).length;
	const failed = results.length - passed;

	const totalDurationMs = results.reduce((sum, r) => sum + r.durationMs, 0);
	const averageStr =
		results.length > 0 ? `${(totalDurationMs / results.length / 1000).toFixed(1)}s` : 'N/A';

	console.log('\n=== Summary ===');
	console.log(
		`Total: ${String(results.length)}, Produced workflow: ${String(passed)}, Failed: ${String(failed)}`,
	);
	console.log(`Average duration: ${averageStr}`);
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// LangSmith mode
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Runs the evaluation against a LangSmith dataset.
 *
 * Requires `LANGSMITH_API_KEY`; exits with code 1 when it is not set. Each
 * dataset example is mapped to a test case, executed through `runSubAgent`,
 * and its feedback is handed to LangSmith via the feedback-extractor
 * evaluator. Pending trace batches are awaited before the experiment name is
 * printed.
 *
 * @param args - Parsed CLI options; `args.dataset` is expected to be set by the caller.
 * @param config - Runner configuration shared by all examples.
 * @param deps - Runner dependencies forwarded to `runSubAgent`.
 */
async function runLangsmithMode(
	args: CliArgs,
	config: SubAgentRunnerConfig,
	deps: RunSubAgentDeps,
): Promise<void> {
	const apiKey = process.env.LANGSMITH_API_KEY;
	if (!apiKey) {
		console.error('Error: LANGSMITH_API_KEY is required for --dataset mode');
		process.exit(1);
	}

	const lsClient = new Client({ apiKey });
	const datasetName = args.dataset!;

	// Target function invoked by LangSmith once per dataset example.
	async function runExample(inputs: Record<string, unknown>) {
		const testCase = mapExampleToTestCase(inputs);
		// Examples without an explicit sub-agent use the one chosen on the command line.
		testCase.subagent ??= args.subagent;

		const { text, capturedWorkflows, feedback, durationMs, error } = await runSubAgent(
			testCase,
			config,
			deps,
		);

		return {
			prompt: testCase.prompt,
			subagent: testCase.subagent ?? 'builder',
			text,
			workflow: capturedWorkflows[0]?.json ?? null,
			feedback,
			durationMs,
			error,
		};
	}

	const banner = [
		'Running LangSmith evaluation:',
		` Dataset: ${datasetName}`,
		` Experiment: ${args.experiment ?? '(auto-generated)'}`,
		` Sub-agent: ${args.subagent}`,
		` Concurrency: ${String(args.concurrency)}`,
		'',
	];
	for (const line of banner) {
		console.log(line);
	}

	const experimentResults = await evaluate(runExample, {
		data: datasetName,
		evaluators: [createFeedbackExtractor()],
		experimentPrefix: args.experiment,
		maxConcurrency: args.concurrency,
		client: lsClient,
		metadata: {
			subagent: args.subagent,
			modelId: config.modelId,
			maxSteps: config.maxSteps,
			timeoutMs: config.timeoutMs,
		},
	});

	await lsClient.awaitPendingTraceBatches();

	// Prefer the name reported by LangSmith; fall back to the requested prefix.
	let experimentName: string;
	if ('experimentName' in experimentResults) {
		experimentName = String(experimentResults.experimentName);
	} else {
		experimentName = args.experiment ?? 'unknown';
	}

	console.log(`\nExperiment complete: ${experimentName}`);
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Local mode (batches of --concurrency test cases, with optional single prompt)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Runs test cases locally and prints per-case results plus a final summary.
 *
 * The test cases are either the single `--prompt` given on the command line
 * or the local JSON files (optionally narrowed by `--filter`). They are
 * executed in consecutive batches of `args.concurrency`; each batch runs in
 * parallel and its results are printed once the whole batch has finished.
 *
 * @param args - Parsed CLI options.
 * @param config - Runner configuration shared by all test cases.
 * @param deps - Runner dependencies forwarded to `runSubAgent`.
 */
async function runLocalMode(
	args: CliArgs,
	config: SubAgentRunnerConfig,
	deps: RunSubAgentDeps,
): Promise<void> {
	const testCases: SubAgentTestCase[] = args.prompt
		? [{ id: 'cli-prompt', prompt: args.prompt, subagent: args.subagent }]
		: loadLocalTestCases(args.filter, args.subagent);

	if (testCases.length === 0) {
		console.log('No test cases found.');
		return;
	}

	console.log(
		`Running ${String(testCases.length)} sub-agent test case(s) with model ${config.modelId ?? '<server default>'} (concurrency: ${String(args.concurrency)})\n`,
	);

	const results: SubAgentResult[] = [];
	const batchSize = args.concurrency;

	// With a batch size of 1 this loop degenerates to strictly sequential execution.
	let offset = 0;
	while (offset < testCases.length) {
		const batch = testCases.slice(offset, offset + batchSize);

		if (args.verbose) {
			batch.forEach((tc) => {
				console.log(`Starting: ${tc.id} — ${truncate(tc.prompt, 80)}`);
			});
		}

		const pending = batch.map(async (testCase) => await runSubAgent(testCase, config, deps));
		const batchResults = await Promise.all(pending);

		// Print in input order, regardless of which run finished first.
		for (const result of batchResults) {
			results.push(result);
			printResult(result, args.verbose);
		}

		offset += batchSize;
	}

	printSummary(results);
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Main
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * CLI entry point: parses the arguments, logs in to the n8n instance, and
 * dispatches to LangSmith mode (when `--dataset` is given) or local mode.
 */
async function main(): Promise<void> {
	const args = parseArgs(process.argv.slice(2));
	const { modelId, timeoutMs, maxSteps, verbose } = args;

	const config: SubAgentRunnerConfig = { modelId, timeoutMs, maxSteps, verbose };

	const client = new N8nClient(args.baseUrl);
	await client.login();

	// Workflows are deleted after each run unless --keep-workflows was passed.
	const deps: RunSubAgentDeps = { client, deleteAfterRun: !args.keepWorkflows };

	if (args.dataset) {
		await runLangsmithMode(args, config, deps);
		return;
	}

	await runLocalMode(args, config, deps);
}
|
|
|
|
// Start the CLI; any rejection that escapes main() is logged and turned into exit code 1.
main().catch((fatal) => {
	console.error('Fatal error:', fatal);
	process.exit(1);
});
|