n8n/packages/@n8n/ai-workflow-builder.ee/evaluations/cli/argument-parser.ts

/* eslint-disable @typescript-eslint/naming-convention */
import { z } from 'zod';

import { AVAILABLE_MODELS, DEFAULT_MODEL, type ModelId } from '@/llm-config';
import type { BuilderFeatureFlags } from '@/workflow-builder-agent';

import type { LangsmithExampleFilters } from '../harness/harness-types';
import { DEFAULTS } from '../support/constants';
import type { StageModels } from '../support/environment';

export type EvaluationSuite =
	| 'llm-judge'
	| 'pairwise'
	| 'programmatic'
	| 'similarity'
	| 'introspection'
	| 'binary-checks';
export type EvaluationBackend = 'local' | 'langsmith';
export type AgentType = 'multi-agent' | 'code-builder';

export interface EvaluationArgs {
	suite: EvaluationSuite;
	backend: EvaluationBackend;
	agent: AgentType;

	verbose: boolean;
	repetitions: number;
	concurrency: number;
	timeoutMs: number;
	experimentName?: string;
	outputDir?: string;
	datasetName?: string;
	maxExamples?: number;
	filters?: LangsmithExampleFilters;

	testCase?: string;
	promptsCsv?: string;

	prompt?: string;
	dos?: string;
	donts?: string;

	numJudges: number;

	featureFlags?: BuilderFeatureFlags;

	/** URL to POST evaluation results to when complete */
	webhookUrl?: string;
	/** Secret for HMAC-SHA256 signature of webhook payload */
	webhookSecret?: string;

	/** CSV file path for evaluation results */
	outputCsv?: string;

	/** Comma-separated list of binary check names to run */
	checks?: string[];

	// Model configuration
	/** Default model for all stages */
	model: ModelId;
	/** Model for LLM judge evaluation */
	judgeModel?: ModelId;
	/** Model for supervisor stage */
	supervisorModel?: ModelId;
	/** Model for responder stage */
	responderModel?: ModelId;
	/** Model for discovery stage */
	discoveryModel?: ModelId;
	/** Model for builder stage (structure and configuration) */
	builderModel?: ModelId;
	/** Model for parameter updater (within builder) */
	parameterUpdaterModel?: ModelId;
}

type CliValueKind = 'boolean' | 'string';
type FlagGroup =
	| 'input'
	| 'eval'
	| 'pairwise'
	| 'langsmith'
	| 'output'
	| 'feature'
	| 'model'
	| 'advanced';

// Model ID validation schema
const modelIdSchema = z.enum(AVAILABLE_MODELS as [ModelId, ...ModelId[]]);

const cliSchema = z
	.object({
		suite: z
			.enum([
				'llm-judge',
				'pairwise',
				'programmatic',
				'similarity',
				'introspection',
				'binary-checks',
			])
			.default('llm-judge'),
		backend: z.enum(['local', 'langsmith']).default('local'),
		agent: z.enum(['code-builder', 'multi-agent']).default('code-builder'),

		verbose: z.boolean().default(false),
		repetitions: z.coerce.number().int().positive().default(DEFAULTS.REPETITIONS),
		concurrency: z.coerce.number().int().positive().default(DEFAULTS.CONCURRENCY),
		timeoutMs: z.coerce.number().int().positive().default(DEFAULTS.TIMEOUT_MS),
		experimentName: z.string().min(1).optional(),
		outputDir: z.string().min(1).optional(),
		outputCsv: z.string().min(1).optional(),
		datasetName: z.string().min(1).optional(),
		maxExamples: z.coerce.number().int().positive().optional(),
		filter: z.array(z.string().min(1)).default([]),
		notionId: z.string().min(1).optional(),
		technique: z.string().min(1).optional(),

		testCase: z.string().min(1).optional(),
		promptsCsv: z.string().min(1).optional(),

		prompt: z.string().min(1).optional(),
		dos: z.string().min(1).optional(),
		donts: z.string().min(1).optional(),

		numJudges: z.coerce.number().int().positive().default(DEFAULTS.NUM_JUDGES),

		checks: z.string().min(1).optional(),
		langsmith: z.boolean().optional(),
		templateExamples: z.boolean().default(false),
		webhookUrl: z.string().url().optional(),
		webhookSecret: z.string().min(16).optional(),

		// Model configuration
		model: modelIdSchema.default(DEFAULT_MODEL),
		judgeModel: modelIdSchema.optional(),
		supervisorModel: modelIdSchema.optional(),
		responderModel: modelIdSchema.optional(),
		discoveryModel: modelIdSchema.optional(),
		builderModel: modelIdSchema.optional(),
		parameterUpdaterModel: modelIdSchema.optional(),
	})
	.strict();

type CliKey = keyof z.infer<typeof cliSchema>;

type FlagDef = { key: CliKey; kind: CliValueKind; desc: string; group: FlagGroup };

const FLAG_DEFS: Record<string, FlagDef> = {
	// Input sources
	'--prompt': { key: 'prompt', kind: 'string', group: 'input', desc: 'Single prompt to evaluate' },
	'--prompts-csv': {
		key: 'promptsCsv',
		kind: 'string',
		group: 'input',
		desc: 'CSV file with prompts',
	},
	'--test-case': {
		key: 'testCase',
		kind: 'string',
		group: 'input',
		desc: 'Run specific default test case by ID',
	},
	'--dataset': {
		key: 'datasetName',
		kind: 'string',
		group: 'input',
		desc: 'LangSmith dataset name',
	},

	// Evaluation options
	'--suite': {
		key: 'suite',
		kind: 'string',
		group: 'eval',
		desc: 'Evaluation suite (llm-judge|pairwise|programmatic|similarity|introspection|binary-checks)',
	},
	'--checks': {
		key: 'checks',
		kind: 'string',
		group: 'eval',
		desc: 'Comma-separated binary check names to run (binary-checks suite only)',
	},
	'--backend': { key: 'backend', kind: 'string', group: 'eval', desc: 'Backend (local|langsmith)' },
	'--agent': {
		key: 'agent',
		kind: 'string',
		group: 'eval',
		desc: 'Agent type (code-builder|multi-agent)',
	},
	'--max-examples': {
		key: 'maxExamples',
		kind: 'string',
		group: 'eval',
		desc: 'Limit number of examples',
	},
	'--repetitions': {
		key: 'repetitions',
		kind: 'string',
		group: 'eval',
		desc: 'Repeat each example N times',
	},
	'--concurrency': {
		key: 'concurrency',
		kind: 'string',
		group: 'eval',
		desc: 'Max parallel evaluations',
	},
	'--timeout-ms': {
		key: 'timeoutMs',
		kind: 'string',
		group: 'eval',
		desc: 'Timeout per evaluation (ms)',
	},

	// Pairwise options
	'--dos': {
		key: 'dos',
		kind: 'string',
		group: 'pairwise',
		desc: 'Requirements the workflow must satisfy',
	},
	'--donts': {
		key: 'donts',
		kind: 'string',
		group: 'pairwise',
		desc: 'Things the workflow must avoid',
	},

	// LangSmith options
	'--langsmith': {
		key: 'langsmith',
		kind: 'boolean',
		group: 'langsmith',
		desc: 'Shorthand for --backend langsmith',
	},
	'--name': { key: 'experimentName', kind: 'string', group: 'langsmith', desc: 'Experiment name' },
	'--filter': {
		key: 'filter',
		kind: 'string',
		group: 'langsmith',
		desc: 'Filter examples (key:value, repeatable)',
	},
	'--notion-id': {
		key: 'notionId',
		kind: 'string',
		group: 'langsmith',
		desc: 'Filter by Notion ID',
	},
	'--technique': {
		key: 'technique',
		kind: 'string',
		group: 'langsmith',
		desc: 'Filter by technique',
	},

	// Output
	'--output-dir': {
		key: 'outputDir',
		kind: 'string',
		group: 'output',
		desc: 'Directory for artifacts',
	},
	'--output-csv': {
		key: 'outputCsv',
		kind: 'string',
		group: 'output',
		desc: 'CSV file for evaluation results - if pre-existing file found it will be overwritten',
	},
	'--verbose': { key: 'verbose', kind: 'boolean', group: 'output', desc: 'Verbose logging' },
	'--webhook-url': {
		key: 'webhookUrl',
		kind: 'string',
		group: 'output',
		desc: 'URL to POST results to when complete',
	},
	'--webhook-secret': {
		key: 'webhookSecret',
		kind: 'string',
		group: 'output',
		desc: 'Secret for HMAC-SHA256 signature (min 16 chars)',
	},

	// Feature flags
	'--template-examples': {
		key: 'templateExamples',
		kind: 'boolean',
		group: 'feature',
		desc: 'Enable template examples phase',
	},

	// Model configuration
	'--model': {
		key: 'model',
		kind: 'string',
		group: 'model',
		desc: `Default model for all stages (default: ${DEFAULT_MODEL})`,
	},
	'--judge-model': {
		key: 'judgeModel',
		kind: 'string',
		group: 'model',
		desc: 'Model for LLM judge evaluation',
	},
	'--supervisor-model': {
		key: 'supervisorModel',
		kind: 'string',
		group: 'model',
		desc: 'Model for supervisor stage',
	},
	'--responder-model': {
		key: 'responderModel',
		kind: 'string',
		group: 'model',
		desc: 'Model for responder stage',
	},
	'--discovery-model': {
		key: 'discoveryModel',
		kind: 'string',
		group: 'model',
		desc: 'Model for discovery stage',
	},
	'--builder-model': {
		key: 'builderModel',
		kind: 'string',
		group: 'model',
		desc: 'Model for builder stage (structure and configuration)',
	},
	'--parameter-updater-model': {
		key: 'parameterUpdaterModel',
		kind: 'string',
		group: 'model',
		desc: 'Model for parameter updater',
	},

	// Advanced
	'--judges': { key: 'numJudges', kind: 'string', group: 'advanced', desc: 'Number of LLM judges' },
};

// Aliases (not shown in help)
const FLAG_ALIASES: Record<string, string> = {
	'--mode': '--suite',
	'-v': '--verbose',
};

// Combined lookup for parsing
const FLAG_TO_KEY: Record<string, FlagDef> = {
	...FLAG_DEFS,
	...Object.fromEntries(
		Object.entries(FLAG_ALIASES).map(([alias, target]) => [alias, FLAG_DEFS[target]]),
	),
};

function formatValidFlags(): string {
	return Object.keys(FLAG_TO_KEY)
		.filter((f) => f.startsWith('--'))
		.sort()
		.join('\n  ');
}

const GROUP_TITLES: Record<FlagGroup, string> = {
	input: 'Input Sources',
	eval: 'Evaluation Options',
	pairwise: 'Pairwise Options',
	langsmith: 'LangSmith Options',
	output: 'Output',
	feature: 'Feature Flags',
	model: 'Model Configuration',
	advanced: 'Advanced',
};

function formatHelp(): string {
	const lines: string[] = [
		'Usage: pnpm eval [options]',
		'',
		'Evaluation harness for AI Workflow Builder.',
		'',
	];

	const groups: FlagGroup[] = [
		'input',
		'eval',
		'pairwise',
		'langsmith',
		'output',
		'feature',
		'model',
		'advanced',
	];

	for (const group of groups) {
		const flags = Object.entries(FLAG_DEFS).filter(([, def]) => def.group === group);
		if (flags.length === 0) continue;

		lines.push(`${GROUP_TITLES[group]}:`);
		for (const [flag, def] of flags) {
			const valueHint = def.kind === 'string' ? ' <value>' : '';
			const padded = `  ${flag}${valueHint}`.padEnd(28);
			lines.push(`${padded}${def.desc}`);
		}
		lines.push('');
	}

	lines.push('Examples:');
	lines.push('  pnpm eval --verbose');
	lines.push('  pnpm eval --prompt "Create a Slack notification workflow"');
	lines.push('  pnpm eval --prompts-csv my-prompts.csv --max-examples 5');
	lines.push('  pnpm eval:langsmith --dataset "workflow-builder-canvas-prompts" --name "test-run"');

	return lines.join('\n');
}

export function printHelp(): void {
	console.log(formatHelp());
}

function ensureValue(argv: string[], i: number, flag: string): string {
	const value = argv[i + 1];
	if (value === undefined) throw new Error(`Flag ${flag} requires a value`);
	return value;
}

function splitFlagToken(token: string): { flag: string; inlineValue?: string } {
	if (!token.startsWith('--')) return { flag: token };
	const equalsIndex = token.indexOf('=');
	if (equalsIndex === -1) return { flag: token };
	return { flag: token.slice(0, equalsIndex), inlineValue: token.slice(equalsIndex + 1) };
}

function isStringArray(value: unknown): value is string[] {
	return Array.isArray(value) && value.every((v): v is string => typeof v === 'string');
}

function parseCli(argv: string[]): {
	values: Partial<Record<CliKey, unknown>>;
	seenKeys: Set<CliKey>;
} {
	const values: Partial<Record<CliKey, unknown>> = {};
	const seenKeys = new Set<CliKey>();

	for (let i = 0; i < argv.length; i++) {
		const token = argv[i];
		if (!token.startsWith('-')) continue;

		const { flag, inlineValue } = splitFlagToken(token);
		const def = FLAG_TO_KEY[flag];

		if (!def) {
			throw new Error(`Unknown flag: ${flag}\n\nValid flags:\n  ${formatValidFlags()}`);
		}

		seenKeys.add(def.key);

		if (def.kind === 'boolean') {
			values[def.key] = true;
			continue;
		}

		const value = inlineValue ?? ensureValue(argv, i, flag);
		if (inlineValue === undefined) i++;

		if (def.key === 'filter') {
			const existing = values.filter;
			values.filter = isStringArray(existing) ? [...existing, value] : [value];
			continue;
		}

		values[def.key] = value;
	}

	return { values, seenKeys };
}

function parseFeatureFlags(args: {
	templateExamples: boolean;
	suite: EvaluationSuite;
}): BuilderFeatureFlags | undefined {
	const templateExamplesFromEnv = process.env.EVAL_FEATURE_TEMPLATE_EXAMPLES === 'true';
	const templateExamples = templateExamplesFromEnv || args.templateExamples;

	// Auto-enable introspection for introspection suite
	const enableIntrospection = args.suite === 'introspection';

	if (!templateExamples && !enableIntrospection) return undefined;

	return {
		templateExamples: templateExamples || undefined,
		enableIntrospection: enableIntrospection || undefined,
	};
}

function parseFilters(args: {
	filter: string[];
	notionId?: string;
	technique?: string;
}): LangsmithExampleFilters | undefined {
	const filters: LangsmithExampleFilters = {};

	for (const raw of args.filter) {
		const match = raw.match(/^(\w+):(.+)$/);
		if (!match) {
			throw new Error('Invalid `--filter` format. Expected: --filter "key:value"');
		}

		const [, key, valueRaw] = match;
		const value = valueRaw.trim();
		if (value.length === 0) {
			throw new Error(`Invalid \`--filter\` value for "${key}": value cannot be empty`);
		}
		switch (key) {
			case 'do':
				filters.doSearch = value;
				break;
			case 'dont':
				filters.dontSearch = value;
				break;
			case 'technique':
				filters.technique = value;
				break;
			case 'id':
				filters.notionId = value;
				break;
			default:
				throw new Error(`Unknown filter key "${key}". Expected one of: do, dont, technique, id`);
		}
	}

	if (args.notionId && !filters.notionId) filters.notionId = args.notionId;
	if (args.technique && !filters.technique) filters.technique = args.technique;

	const hasAny = Object.values(filters).some((v) => typeof v === 'string' && v.length > 0);
	return hasAny ? filters : undefined;
}

export function parseEvaluationArgs(argv: string[] = process.argv.slice(2)): EvaluationArgs {
	// Check for help flag before parsing
	if (argv.includes('--help') || argv.includes('-h')) {
		printHelp();
		process.exit(0);
	}

	const { values, seenKeys } = parseCli(argv);

	if (values.langsmith === true) {
		const backendWasExplicit = seenKeys.has('backend');
		if (backendWasExplicit && values.backend !== 'langsmith') {
			throw new Error('Cannot combine `--langsmith` with `--backend local`');
		}
		values.backend = 'langsmith';
	}

	const parsed = cliSchema.parse(values);

	const featureFlags = parseFeatureFlags({
		templateExamples: parsed.templateExamples,
		suite: parsed.suite,
	});

	const filters = parseFilters({
		filter: parsed.filter,
		notionId: parsed.notionId,
		technique: parsed.technique,
	});

	if (parsed.suite !== 'pairwise' && (filters?.doSearch || filters?.dontSearch)) {
		throw new Error(
			'`--filter do:` and `--filter dont:` are only supported for `--suite pairwise`',
		);
	}

	if (parsed.checks && parsed.suite !== 'binary-checks') {
		throw new Error('`--checks` is only supported for `--suite binary-checks`');
	}

	return {
		suite: parsed.suite,
		backend: parsed.backend,
		agent: parsed.agent,
		verbose: parsed.verbose,
		repetitions: parsed.repetitions,
		concurrency: parsed.concurrency,
		timeoutMs: parsed.timeoutMs,
		experimentName: parsed.experimentName,
		outputDir: parsed.outputDir,
		outputCsv: parsed.outputCsv,
		datasetName: parsed.datasetName,
		maxExamples: parsed.maxExamples,
		filters,
		testCase: parsed.testCase,
		promptsCsv: parsed.promptsCsv,
		prompt: parsed.prompt,
		dos: parsed.dos,
		donts: parsed.donts,
		numJudges: parsed.numJudges,
		featureFlags,
		webhookUrl: parsed.webhookUrl,
		webhookSecret: parsed.webhookSecret,
		checks: parsed.checks?.split(',').map((s) => s.trim()),
		// Model configuration
		model: parsed.model,
		judgeModel: parsed.judgeModel,
		supervisorModel: parsed.supervisorModel,
		responderModel: parsed.responderModel,
		discoveryModel: parsed.discoveryModel,
		builderModel: parsed.builderModel,
		parameterUpdaterModel: parsed.parameterUpdaterModel,
	};
}

/**
 * Converts EvaluationArgs to StageModels for use with environment setup.
 */
export function argsToStageModels(args: EvaluationArgs): StageModels {
	return {
		default: args.model,
		supervisor: args.supervisorModel,
		responder: args.responderModel,
		discovery: args.discoveryModel,
		builder: args.builderModel,
		parameterUpdater: args.parameterUpdaterModel,
		judge: args.judgeModel,
	};
}

export function getDefaultExperimentName(suite: EvaluationSuite): string {
	return suite === 'pairwise' ? DEFAULTS.EXPERIMENT_NAME : DEFAULTS.LLM_JUDGE_EXPERIMENT_NAME;
}

export function getDefaultDatasetName(suite: EvaluationSuite): string {
	if (suite === 'pairwise') return DEFAULTS.DATASET_NAME;
	return process.env.LANGSMITH_DATASET_NAME ?? 'workflow-builder-canvas-prompts';
}