n8n/packages/@n8n/ai-workflow-builder.ee/evaluations/cli/argument-parser.ts
Eugene 00014420b1
refactor(core): Remove multi-agent architecture entry point from AI workflow builder (no-changelog) (#27925)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 16:32:19 +00:00

631 lines
16 KiB
TypeScript

/* eslint-disable @typescript-eslint/naming-convention */
import { z } from 'zod';
import { AVAILABLE_MODELS, DEFAULT_MODEL, type ModelId } from '@/llm-config';
import type { BuilderFeatureFlags } from '@/workflow-builder-agent';
import type { LangsmithExampleFilters } from '../harness/harness-types';
import { DEFAULTS } from '../support/constants';
import type { StageModels } from '../support/environment';
/** Evaluation suites selectable with the `--suite` flag (mirrors the `suite` enum in `cliSchema`). */
export type EvaluationSuite =
| 'llm-judge'
| 'pairwise'
| 'programmatic'
| 'similarity'
| 'introspection'
| 'binary-checks';
/** Backends selectable with the `--backend` flag (or `--langsmith` as shorthand for `langsmith`). */
export type EvaluationBackend = 'local' | 'langsmith';
/** Agent types selectable with the `--agent` flag. */
export type AgentType = 'multi-agent' | 'code-builder';
/**
 * Validated evaluation options as returned by `parseEvaluationArgs`.
 *
 * Fields without `?` always have a value because `cliSchema` supplies a default for them.
 */
export interface EvaluationArgs {
suite: EvaluationSuite;
backend: EvaluationBackend;
agent: AgentType;
verbose: boolean;
repetitions: number;
concurrency: number;
timeoutMs: number;
experimentName?: string;
outputDir?: string;
datasetName?: string;
maxExamples?: number;
/** Example filters assembled from `--filter`, `--notion-id` and `--technique` */
filters?: LangsmithExampleFilters;
testCase?: string;
promptsCsv?: string;
prompt?: string;
dos?: string;
donts?: string;
numJudges: number;
/** Feature flags derived from `--template-examples`, `EVAL_FEATURE_TEMPLATE_EXAMPLES` and the selected suite */
featureFlags?: BuilderFeatureFlags;
/** URL to POST evaluation results to when complete */
webhookUrl?: string;
/** Secret for HMAC-SHA256 signature of webhook payload */
webhookSecret?: string;
/** CSV file path for evaluation results */
outputCsv?: string;
/** Binary check names to run, already split from the comma-separated `--checks` value */
checks?: string[];
// Model configuration
/** Default model for all stages */
model: ModelId;
/** Model for LLM judge evaluation */
judgeModel?: ModelId;
/** Model for supervisor stage */
supervisorModel?: ModelId;
/** Model for responder stage */
responderModel?: ModelId;
/** Model for discovery stage */
discoveryModel?: ModelId;
/** Model for builder stage (structure and configuration) */
builderModel?: ModelId;
/** Model for parameter updater (within builder) */
parameterUpdaterModel?: ModelId;
}
/** How a flag is read from argv: `boolean` flags are bare switches, `string` flags consume a value. */
type CliValueKind = 'boolean' | 'string';
/** Section of the `--help` output a flag is listed under (titles live in `GROUP_TITLES`). */
type FlagGroup =
| 'input'
| 'eval'
| 'pairwise'
| 'langsmith'
| 'output'
| 'feature'
| 'model'
| 'advanced';
// Model ID validation schema
// NOTE(review): the cast presumably exists because `z.enum` is typed for a non-empty tuple — confirm against AVAILABLE_MODELS' declared type.
const modelIdSchema = z.enum(AVAILABLE_MODELS as [ModelId, ...ModelId[]]);
/**
 * Schema for the raw key/value map produced by `parseCli`.
 *
 * Numeric options reach the schema as strings and are converted with `z.coerce.number()`.
 * `.strict()` rejects any key that is not declared here.
 */
const cliSchema = z
.object({
suite: z
.enum([
'llm-judge',
'pairwise',
'programmatic',
'similarity',
'introspection',
'binary-checks',
])
.default('llm-judge'),
backend: z.enum(['local', 'langsmith']).default('local'),
agent: z.enum(['code-builder', 'multi-agent']).default('code-builder'),
verbose: z.boolean().default(false),
repetitions: z.coerce.number().int().positive().default(DEFAULTS.REPETITIONS),
concurrency: z.coerce.number().int().positive().default(DEFAULTS.CONCURRENCY),
timeoutMs: z.coerce.number().int().positive().default(DEFAULTS.TIMEOUT_MS),
experimentName: z.string().min(1).optional(),
outputDir: z.string().min(1).optional(),
outputCsv: z.string().min(1).optional(),
datasetName: z.string().min(1).optional(),
maxExamples: z.coerce.number().int().positive().optional(),
// `--filter` is repeatable; `parseCli` accumulates every occurrence into this array
filter: z.array(z.string().min(1)).default([]),
notionId: z.string().min(1).optional(),
technique: z.string().min(1).optional(),
testCase: z.string().min(1).optional(),
promptsCsv: z.string().min(1).optional(),
prompt: z.string().min(1).optional(),
dos: z.string().min(1).optional(),
donts: z.string().min(1).optional(),
numJudges: z.coerce.number().int().positive().default(DEFAULTS.NUM_JUDGES),
// Raw comma-separated string; split into names by `parseEvaluationArgs`
checks: z.string().min(1).optional(),
// Shorthand switch; `parseEvaluationArgs` translates it into `backend: 'langsmith'`
langsmith: z.boolean().optional(),
templateExamples: z.boolean().default(false),
webhookUrl: z.string().url().optional(),
webhookSecret: z.string().min(16).optional(),
// Model configuration
model: modelIdSchema.default(DEFAULT_MODEL),
judgeModel: modelIdSchema.optional(),
supervisorModel: modelIdSchema.optional(),
responderModel: modelIdSchema.optional(),
discoveryModel: modelIdSchema.optional(),
builderModel: modelIdSchema.optional(),
parameterUpdaterModel: modelIdSchema.optional(),
})
.strict();
/** Property names of the object type inferred from `cliSchema`. */
type CliKey = keyof z.infer<typeof cliSchema>;
/** One CLI flag: the schema key it writes to, how its value is read, its help text and its help section. */
type FlagDef = { key: CliKey; kind: CliValueKind; desc: string; group: FlagGroup };
/**
 * Canonical flag table, keyed by the long flag as typed on the command line.
 *
 * Used for parsing (through `FLAG_TO_KEY`) and for the `--help` output (`formatHelp`).
 * Numeric options are declared `kind: 'string'` because `parseCli` reads their raw value
 * as a string; `cliSchema` coerces it to a number afterwards.
 */
const FLAG_DEFS: Record<string, FlagDef> = {
// Input sources
'--prompt': { key: 'prompt', kind: 'string', group: 'input', desc: 'Single prompt to evaluate' },
'--prompts-csv': {
key: 'promptsCsv',
kind: 'string',
group: 'input',
desc: 'CSV file with prompts',
},
'--test-case': {
key: 'testCase',
kind: 'string',
group: 'input',
desc: 'Run specific default test case by ID',
},
'--dataset': {
key: 'datasetName',
kind: 'string',
group: 'input',
desc: 'LangSmith dataset name',
},
// Evaluation options
'--suite': {
key: 'suite',
kind: 'string',
group: 'eval',
desc: 'Evaluation suite (llm-judge|pairwise|programmatic|similarity|introspection|binary-checks)',
},
'--checks': {
key: 'checks',
kind: 'string',
group: 'eval',
desc: 'Comma-separated binary check names to run (binary-checks suite only)',
},
'--backend': { key: 'backend', kind: 'string', group: 'eval', desc: 'Backend (local|langsmith)' },
'--agent': {
key: 'agent',
kind: 'string',
group: 'eval',
desc: 'Agent type (code-builder|multi-agent)',
},
'--max-examples': {
key: 'maxExamples',
kind: 'string',
group: 'eval',
desc: 'Limit number of examples',
},
'--repetitions': {
key: 'repetitions',
kind: 'string',
group: 'eval',
desc: 'Repeat each example N times',
},
'--concurrency': {
key: 'concurrency',
kind: 'string',
group: 'eval',
desc: 'Max parallel evaluations',
},
'--timeout-ms': {
key: 'timeoutMs',
kind: 'string',
group: 'eval',
desc: 'Timeout per evaluation (ms)',
},
// Pairwise options
'--dos': {
key: 'dos',
kind: 'string',
group: 'pairwise',
desc: 'Requirements the workflow must satisfy',
},
'--donts': {
key: 'donts',
kind: 'string',
group: 'pairwise',
desc: 'Things the workflow must avoid',
},
// LangSmith options
'--langsmith': {
key: 'langsmith',
kind: 'boolean',
group: 'langsmith',
desc: 'Shorthand for --backend langsmith',
},
'--name': { key: 'experimentName', kind: 'string', group: 'langsmith', desc: 'Experiment name' },
'--filter': {
key: 'filter',
kind: 'string',
group: 'langsmith',
desc: 'Filter examples (key:value, repeatable)',
},
'--notion-id': {
key: 'notionId',
kind: 'string',
group: 'langsmith',
desc: 'Filter by Notion ID',
},
'--technique': {
key: 'technique',
kind: 'string',
group: 'langsmith',
desc: 'Filter by technique',
},
// Output
'--output-dir': {
key: 'outputDir',
kind: 'string',
group: 'output',
desc: 'Directory for artifacts',
},
'--output-csv': {
key: 'outputCsv',
kind: 'string',
group: 'output',
desc: 'CSV file for evaluation results - if pre-existing file found it will be overwritten',
},
'--verbose': { key: 'verbose', kind: 'boolean', group: 'output', desc: 'Verbose logging' },
'--webhook-url': {
key: 'webhookUrl',
kind: 'string',
group: 'output',
desc: 'URL to POST results to when complete',
},
'--webhook-secret': {
key: 'webhookSecret',
kind: 'string',
group: 'output',
desc: 'Secret for HMAC-SHA256 signature (min 16 chars)',
},
// Feature flags
'--template-examples': {
key: 'templateExamples',
kind: 'boolean',
group: 'feature',
desc: 'Enable template examples phase',
},
// Model configuration
'--model': {
key: 'model',
kind: 'string',
group: 'model',
desc: `Default model for all stages (default: ${DEFAULT_MODEL})`,
},
'--judge-model': {
key: 'judgeModel',
kind: 'string',
group: 'model',
desc: 'Model for LLM judge evaluation',
},
'--supervisor-model': {
key: 'supervisorModel',
kind: 'string',
group: 'model',
desc: 'Model for supervisor stage',
},
'--responder-model': {
key: 'responderModel',
kind: 'string',
group: 'model',
desc: 'Model for responder stage',
},
'--discovery-model': {
key: 'discoveryModel',
kind: 'string',
group: 'model',
desc: 'Model for discovery stage',
},
'--builder-model': {
key: 'builderModel',
kind: 'string',
group: 'model',
desc: 'Model for builder stage (structure and configuration)',
},
'--parameter-updater-model': {
key: 'parameterUpdaterModel',
kind: 'string',
group: 'model',
desc: 'Model for parameter updater',
},
// Advanced
'--judges': { key: 'numJudges', kind: 'string', group: 'advanced', desc: 'Number of LLM judges' },
};
// Aliases (not shown in help)
// Maps an alternative spelling to the canonical flag it stands for in FLAG_DEFS.
const FLAG_ALIASES: Record<string, string> = {
'--mode': '--suite',
'-v': '--verbose',
};
// Combined lookup for parsing
// Each alias shares the FlagDef object of its target. An alias whose target is missing
// from FLAG_DEFS maps to `undefined`, which `parseCli` reports as an unknown flag.
const FLAG_TO_KEY: Record<string, FlagDef> = {
...FLAG_DEFS,
...Object.fromEntries(
Object.entries(FLAG_ALIASES).map(([alias, target]) => [alias, FLAG_DEFS[target]]),
),
};
/**
 * Lists every long (`--`-prefixed) flag known to the parser, aliases included,
 * sorted and separated by a newline plus one space for the "unknown flag" error.
 */
function formatValidFlags(): string {
  const longFlags: string[] = [];
  for (const name of Object.keys(FLAG_TO_KEY)) {
    // Short aliases such as `-v` are left out of the listing.
    if (name.startsWith('--')) longFlags.push(name);
  }
  longFlags.sort();
  return longFlags.join('\n ');
}
/** Section heading printed by `formatHelp` for each flag group. */
const GROUP_TITLES: Record<FlagGroup, string> = {
input: 'Input Sources',
eval: 'Evaluation Options',
pairwise: 'Pairwise Options',
langsmith: 'LangSmith Options',
output: 'Output',
feature: 'Feature Flags',
model: 'Model Configuration',
advanced: 'Advanced',
};
/**
 * Builds the `--help` text: usage header, one section per non-empty flag group,
 * then a few example invocations.
 *
 * The description column is sized from the longest flag label so every description
 * is separated from its flag by at least two spaces. A fixed `padEnd(28)` left long
 * labels such as `--parameter-updater-model <value>` glued to their description.
 * The column never shrinks below the original 28 characters.
 *
 * @returns The complete help text, lines joined with `\n`.
 */
function formatHelp(): string {
  const minLabelWidth = 28;
  const labelGap = 2;
  const labelFor = (flag: string, def: FlagDef): string =>
    ` ${flag}${def.kind === 'string' ? ' <value>' : ''}`;

  const allFlags = Object.entries(FLAG_DEFS);
  const labelWidth = allFlags.reduce(
    (width, [flag, def]) => Math.max(width, labelFor(flag, def).length + labelGap),
    minLabelWidth,
  );

  const lines: string[] = [
    'Usage: pnpm eval [options]',
    '',
    'Evaluation harness for AI Workflow Builder.',
    '',
  ];

  // Explicit list so the section order of the help output is fixed.
  const groups: FlagGroup[] = [
    'input',
    'eval',
    'pairwise',
    'langsmith',
    'output',
    'feature',
    'model',
    'advanced',
  ];

  for (const group of groups) {
    const flags = allFlags.filter(([, def]) => def.group === group);
    if (flags.length === 0) continue;
    lines.push(`${GROUP_TITLES[group]}:`);
    for (const [flag, def] of flags) {
      lines.push(`${labelFor(flag, def).padEnd(labelWidth)}${def.desc}`);
    }
    lines.push('');
  }

  lines.push('Examples:');
  lines.push(' pnpm eval --verbose');
  lines.push(' pnpm eval --prompt "Create a Slack notification workflow"');
  lines.push(' pnpm eval --prompts-csv my-prompts.csv --max-examples 5');
  lines.push(' pnpm eval:langsmith --dataset "workflow-builder-canvas-prompts" --name "test-run"');
  return lines.join('\n');
}
/** Prints the CLI help text produced by `formatHelp` using `console.log`. */
export function printHelp(): void {
  const helpText = formatHelp();
  console.log(helpText);
}
/**
 * Returns the token that follows position `i` in `argv`, i.e. the value of `flag`.
 *
 * @throws Error when no token follows the flag.
 */
function ensureValue(argv: string[], i: number, flag: string): string {
  const next = argv[i + 1];
  if (next !== undefined) return next;
  throw new Error(`Flag ${flag} requires a value`);
}
/**
 * Splits a `--flag=value` token at the first `=`.
 *
 * Tokens that do not start with `--`, or that contain no `=`, are returned unchanged
 * as the flag with no inline value.
 */
function splitFlagToken(token: string): { flag: string; inlineValue?: string } {
  // Only long flags support the inline `=` form.
  const separatorAt = token.startsWith('--') ? token.indexOf('=') : -1;
  if (separatorAt < 0) return { flag: token };
  const flag = token.slice(0, separatorAt);
  const inlineValue = token.slice(separatorAt + 1);
  return { flag, inlineValue };
}
/** Type guard: true when `value` is an array in which no element is a non-string. */
function isStringArray(value: unknown): value is string[] {
  if (!Array.isArray(value)) return false;
  const hasNonString = value.some((item: unknown) => typeof item !== 'string');
  return !hasNonString;
}
/**
 * Interprets the optional inline value of a boolean switch.
 *
 * A bare switch (`--verbose`) means `true`. `--verbose=true` and `--verbose=false`
 * are honoured explicitly instead of always being treated as `true`.
 *
 * @throws Error when an inline value other than `true`/`false` is supplied.
 */
function parseBooleanFlagValue(flag: string, inlineValue: string | undefined): boolean {
  if (inlineValue === undefined) return true;
  const normalized = inlineValue.trim().toLowerCase();
  if (normalized === 'true') return true;
  if (normalized === 'false') return false;
  throw new Error(
    `Flag ${flag} is a boolean switch; expected "true" or "false" but got "${inlineValue}"`,
  );
}

/**
 * Walks `argv` and collects raw flag values keyed by their schema key.
 *
 * - Tokens that do not start with `-` are ignored.
 * - Long flags accept both `--flag value` and `--flag=value`.
 * - Boolean switches are `true` when bare and accept an explicit `=true` / `=false`.
 * - `--filter` is repeatable; every occurrence is appended to an array.
 * - Any other flag given more than once keeps its last value.
 *
 * @returns `values`, the raw (unvalidated) values, and `seenKeys`, the keys that
 *   appeared explicitly on the command line.
 * @throws Error for an unknown flag, a value flag with no value, or a boolean switch
 *   with an inline value other than `true`/`false`.
 */
function parseCli(argv: string[]): {
  values: Partial<Record<CliKey, unknown>>;
  seenKeys: Set<CliKey>;
} {
  const values: Partial<Record<CliKey, unknown>> = {};
  const seenKeys = new Set<CliKey>();

  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    if (!token.startsWith('-')) continue;

    const { flag, inlineValue } = splitFlagToken(token);
    const def = FLAG_TO_KEY[flag];
    if (!def) {
      throw new Error(`Unknown flag: ${flag}\n\nValid flags:\n ${formatValidFlags()}`);
    }
    seenKeys.add(def.key);

    if (def.kind === 'boolean') {
      values[def.key] = parseBooleanFlagValue(flag, inlineValue);
      continue;
    }

    const value = inlineValue ?? ensureValue(argv, i, flag);
    // The value came from the next token, so skip over it.
    if (inlineValue === undefined) i++;

    if (def.key === 'filter') {
      const existing = values.filter;
      values.filter = isStringArray(existing) ? [...existing, value] : [value];
      continue;
    }
    values[def.key] = value;
  }

  return { values, seenKeys };
}
/**
 * Derives the builder feature flags for an evaluation run.
 *
 * Template examples are enabled by `--template-examples` or by
 * `EVAL_FEATURE_TEMPLATE_EXAMPLES=true`. Introspection is enabled automatically for
 * the `introspection` suite.
 *
 * @returns `undefined` when no flag is enabled; otherwise an object in which each
 *   enabled flag is `true` and each disabled one is `undefined`.
 */
function parseFeatureFlags(args: {
  templateExamples: boolean;
  suite: EvaluationSuite;
}): BuilderFeatureFlags | undefined {
  const useTemplateExamples =
    process.env.EVAL_FEATURE_TEMPLATE_EXAMPLES === 'true' || args.templateExamples;
  const introspect = args.suite === 'introspection';

  if (useTemplateExamples || introspect) {
    return {
      templateExamples: useTemplateExamples ? true : undefined,
      enableIntrospection: introspect ? true : undefined,
    };
  }
  return undefined;
}
/**
 * Builds the example filters from repeated `--filter key:value` entries plus the
 * dedicated `--notion-id` / `--technique` options.
 *
 * Recognised keys are `do`, `dont`, `technique` and `id`. A repeated key keeps its
 * last value. `--notion-id` / `--technique` only apply when the matching `--filter`
 * key did not already set a value.
 *
 * @returns The filters, or `undefined` when none carries a non-empty string.
 * @throws Error for a malformed entry, an empty value, or an unknown key.
 */
function parseFilters(args: {
  filter: string[];
  notionId?: string;
  technique?: string;
}): LangsmithExampleFilters | undefined {
  const entryPattern = /^(\w+):(.+)$/;
  const targetByKey = new Map<string, 'doSearch' | 'dontSearch' | 'technique' | 'notionId'>([
    ['do', 'doSearch'],
    ['dont', 'dontSearch'],
    ['technique', 'technique'],
    ['id', 'notionId'],
  ]);
  const filters: LangsmithExampleFilters = {};

  for (const entry of args.filter) {
    const parts = entryPattern.exec(entry);
    if (parts === null) {
      throw new Error('Invalid `--filter` format. Expected: --filter "key:value"');
    }
    const key = parts[1];
    const value = parts[2].trim();
    // The empty-value check runs before the key lookup, so a blank value is reported first.
    if (value.length === 0) {
      throw new Error(`Invalid \`--filter\` value for "${key}": value cannot be empty`);
    }
    const target = targetByKey.get(key);
    if (target === undefined) {
      throw new Error(`Unknown filter key "${key}". Expected one of: do, dont, technique, id`);
    }
    filters[target] = value;
  }

  if (!filters.notionId && args.notionId) filters.notionId = args.notionId;
  if (!filters.technique && args.technique) filters.technique = args.technique;

  for (const candidate of Object.values(filters)) {
    if (typeof candidate === 'string' && candidate.length > 0) return filters;
  }
  return undefined;
}
/**
 * Splits the raw `--checks` value into trimmed, non-empty check names.
 *
 * Empty segments (from `a,,b` or a trailing comma) are dropped so callers never
 * receive `''` as a check name.
 *
 * @throws Error when a value was supplied but contains no check name at all.
 */
function parseCheckNames(raw: string | undefined): string[] | undefined {
  if (raw === undefined) return undefined;
  const names = raw
    .split(',')
    .map((name) => name.trim())
    .filter((name) => name.length > 0);
  if (names.length === 0) {
    throw new Error('`--checks` requires at least one check name');
  }
  return names;
}

/**
 * Parses and validates the evaluation CLI arguments.
 *
 * Steps: handle `--help`/`-h` (prints help and exits with code 0), collect raw flag
 * values with `parseCli`, resolve the `--langsmith` shorthand, validate with
 * `cliSchema`, derive feature flags and filters, then apply cross-option rules.
 *
 * @param argv Arguments to parse; defaults to `process.argv.slice(2)`.
 * @returns The validated options.
 * @throws Error when `--langsmith` is combined with a different explicit `--backend`,
 *   when `do:`/`dont:` filters are used outside the pairwise suite, when `--checks`
 *   is used outside the binary-checks suite or names no check, plus any error thrown
 *   by `parseCli`, `cliSchema.parse` or `parseFilters`.
 */
export function parseEvaluationArgs(argv: string[] = process.argv.slice(2)): EvaluationArgs {
  // Check for help flag before parsing
  if (argv.includes('--help') || argv.includes('-h')) {
    printHelp();
    process.exit(0);
  }

  const { values, seenKeys } = parseCli(argv);

  if (values.langsmith === true) {
    const backendWasExplicit = seenKeys.has('backend');
    if (backendWasExplicit && values.backend !== 'langsmith') {
      throw new Error('Cannot combine `--langsmith` with `--backend local`');
    }
    values.backend = 'langsmith';
  }

  const parsed = cliSchema.parse(values);

  const featureFlags = parseFeatureFlags({
    templateExamples: parsed.templateExamples,
    suite: parsed.suite,
  });
  const filters = parseFilters({
    filter: parsed.filter,
    notionId: parsed.notionId,
    technique: parsed.technique,
  });

  if (parsed.suite !== 'pairwise' && (filters?.doSearch || filters?.dontSearch)) {
    throw new Error(
      '`--filter do:` and `--filter dont:` are only supported for `--suite pairwise`',
    );
  }
  if (parsed.checks && parsed.suite !== 'binary-checks') {
    throw new Error('`--checks` is only supported for `--suite binary-checks`');
  }

  return {
    suite: parsed.suite,
    backend: parsed.backend,
    agent: parsed.agent,
    verbose: parsed.verbose,
    repetitions: parsed.repetitions,
    concurrency: parsed.concurrency,
    timeoutMs: parsed.timeoutMs,
    experimentName: parsed.experimentName,
    outputDir: parsed.outputDir,
    outputCsv: parsed.outputCsv,
    datasetName: parsed.datasetName,
    maxExamples: parsed.maxExamples,
    filters,
    testCase: parsed.testCase,
    promptsCsv: parsed.promptsCsv,
    prompt: parsed.prompt,
    dos: parsed.dos,
    donts: parsed.donts,
    numJudges: parsed.numJudges,
    featureFlags,
    webhookUrl: parsed.webhookUrl,
    webhookSecret: parsed.webhookSecret,
    checks: parseCheckNames(parsed.checks),
    // Model configuration
    model: parsed.model,
    judgeModel: parsed.judgeModel,
    supervisorModel: parsed.supervisorModel,
    responderModel: parsed.responderModel,
    discoveryModel: parsed.discoveryModel,
    builderModel: parsed.builderModel,
    parameterUpdaterModel: parsed.parameterUpdaterModel,
  };
}
/**
 * Maps the model-related fields of `EvaluationArgs` onto the `StageModels` shape.
 *
 * `model` becomes `default`; each per-stage override is passed through as-is and
 * stays `undefined` when it was not set.
 */
export function argsToStageModels(args: EvaluationArgs): StageModels {
  const {
    model,
    supervisorModel,
    responderModel,
    discoveryModel,
    builderModel,
    parameterUpdaterModel,
    judgeModel,
  } = args;

  return {
    default: model,
    supervisor: supervisorModel,
    responder: responderModel,
    discovery: discoveryModel,
    builder: builderModel,
    parameterUpdater: parameterUpdaterModel,
    judge: judgeModel,
  };
}
/**
 * Returns the default experiment name for a suite: `DEFAULTS.EXPERIMENT_NAME` for
 * `pairwise`, `DEFAULTS.LLM_JUDGE_EXPERIMENT_NAME` for every other suite.
 */
export function getDefaultExperimentName(suite: EvaluationSuite): string {
  if (suite === 'pairwise') {
    return DEFAULTS.EXPERIMENT_NAME;
  }
  return DEFAULTS.LLM_JUDGE_EXPERIMENT_NAME;
}
/**
 * Returns the default dataset name for a suite.
 *
 * `pairwise` uses `DEFAULTS.DATASET_NAME`. Every other suite uses the
 * `LANGSMITH_DATASET_NAME` environment variable when it is defined, and
 * `'workflow-builder-canvas-prompts'` otherwise.
 */
export function getDefaultDatasetName(suite: EvaluationSuite): string {
  return suite === 'pairwise'
    ? DEFAULTS.DATASET_NAME
    : (process.env.LANGSMITH_DATASET_NAME ?? 'workflow-builder-canvas-prompts');
}