// ---------------------------------------------------------------------------
// Workflow-build eval runner
//
// Routes prompts through the normal Instance AI orchestrator build path and
// scores the resulting workflow with binary checks.
// ---------------------------------------------------------------------------

import type {
  CapturedWorkflow,
  Feedback,
  WorkflowBuildEvalResult,
  WorkflowBuildEvalConfig,
  WorkflowBuildEvalCase,
} from './types';
import { runBinaryChecks } from '../binaryChecks/index';
import type { BinaryCheckContext } from '../binaryChecks/types';
import type { N8nClient, WorkflowResponse } from '../clients/n8n-client';
import { createLogger, type EvalLogger } from '../harness/logger';
import { buildWorkflow, cleanupBuild, type BuildResult } from '../harness/runner';

/**
 * Client-side model used by binary checks (they call Anthropic directly with
 * ANTHROPIC_API_KEY). Independent of the server-side agent model, which the
 * server resolves from its own settings when the CLI doesn't pass `--model`.
 */
const BINARY_CHECK_DEFAULT_MODEL = 'anthropic/claude-sonnet-4-20250514';

/** Collaborators and shared run state handed to {@link runWorkflowBuildEval}. */
export interface RunWorkflowBuildEvalDeps {
  /** Client passed to `buildWorkflow` and `cleanupBuild`. */
  client: N8nClient;
  /** Delete workflows after the run (default true). Disable with --keep-workflows. */
  deleteAfterRun: boolean;
  /** Forwarded unchanged to `buildWorkflow`. */
  preRunWorkflowIds: Set<string>;
  /** Forwarded unchanged to `buildWorkflow`. */
  claimedWorkflowIds: Set<string>;
}

/**
 * Runs one workflow-build eval case: sends the case prompt through
 * `buildWorkflow` as a single user turn, scores the resulting workflows, and
 * returns the outcome together with the elapsed wall-clock time.
 *
 * Anything thrown while building or scoring is converted into a result whose
 * `error` is set and whose only feedback entry is a zero-score `run_error`.
 *
 * @param testCase - Case to run; its `modelId` takes precedence over `config.modelId`.
 * @param config - Run-wide settings (`modelId`, `timeoutMs`, `verbose`).
 * @param deps - Client, cleanup flag and workflow-id sets.
 * @returns The eval result for this case.
 */
export async function runWorkflowBuildEval(
  testCase: WorkflowBuildEvalCase,
  config: WorkflowBuildEvalConfig,
  deps: RunWorkflowBuildEvalDeps,
): Promise<WorkflowBuildEvalResult> {
  const startMs = Date.now();
  const modelId = testCase.modelId ?? config.modelId;
  const logger = createRunnerLogger(config.verbose ?? false);

  // Declared outside `try` so the `finally` block can clean up whatever was built.
  let build: BuildResult | undefined;

  try {
    build = await buildWorkflow({
      client: deps.client,
      conversation: [{ role: 'user', text: testCase.prompt }],
      timeoutMs: config.timeoutMs,
      preRunWorkflowIds: deps.preRunWorkflowIds,
      claimedWorkflowIds: deps.claimedWorkflowIds,
      logger,
      skipWorkflowChecks: true,
    });

    const capturedWorkflows = build.workflowJsons.map(toCapturedWorkflow);
    const agentTextResponse = extractAgentText(build);

    const feedback = await evaluateCapturedWorkflows({
      workflows: build.workflowJsons,
      prompt: testCase.prompt,
      modelId: modelId ?? BINARY_CHECK_DEFAULT_MODEL,
      agentTextResponse,
      ...(testCase.annotations ? { annotations: testCase.annotations } : {}),
    });

    // Surface the orchestrator build error both as feedback (so LangSmith scores
    // it) and as `result.error` (so the CLI printer shows it inline). Same
    // string, two consumers — intentional.
    if (build.error) {
      feedback.unshift(runErrorFeedback(build.error));
    }

    const result: WorkflowBuildEvalResult = {
      testCase,
      text: agentTextResponse,
      capturedWorkflows,
      feedback,
      durationMs: Date.now() - startMs,
    };
    if (build.error) result.error = build.error;
    return result;
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return {
      testCase,
      text: '',
      capturedWorkflows: [],
      feedback: [runErrorFeedback(message)],
      durationMs: Date.now() - startMs,
      error: message,
    };
  } finally {
    if (deps.deleteAfterRun && build) {
      try {
        await cleanupBuild(deps.client, build, logger);
      } catch (cleanupError) {
        // cleanupBuild is best-effort; keep the eval result focused on
        // build/scoring. Report the failure through the run's logger (a no-op
        // unless verbose) instead of dropping it, and never rethrow.
        const reason =
          cleanupError instanceof Error ? cleanupError.message : String(cleanupError);
        logger.warn(`Workflow cleanup failed: ${reason}`);
      }
    }
  }
}

/** Builds the zero-score `run_error` feedback entry carrying `comment`. */
function runErrorFeedback(comment: string): Feedback {
  return {
    evaluator: 'workflow-build-runner',
    metric: 'run_error',
    score: 0,
    kind: 'score',
    comment,
  };
}

/**
 * Reduces a workflow response to its `name`, `nodes` and `connections`, and
 * marks the capture as successful.
 */
function toCapturedWorkflow(workflow: WorkflowResponse): CapturedWorkflow {
  return {
    json: {
      name: workflow.name,
      nodes: workflow.nodes,
      connections: workflow.connections,
    } as CapturedWorkflow['json'],
    success: true,
  };
}

/**
 * Joins the non-empty `agentText` of every transcript turn with blank lines.
 * Returns an empty string when the build has no transcript.
 */
function extractAgentText(build: BuildResult): string {
  return (
    build.transcript
      ?.map((turn) => turn.agentText)
      .filter((text) => text.length > 0)
      .join('\n\n') ?? ''
  );
}

/**
 * Returns the logger from `createLogger(true)` when `verbose` is set;
 * otherwise a logger whose methods all do nothing.
 */
function createRunnerLogger(verbose: boolean): EvalLogger {
  if (verbose) return createLogger(true);
  return {
    info: () => {},
    verbose: () => {},
    success: () => {},
    warn: () => {},
    error: () => {},
    isVerbose: false,
  };
}

// ---------------------------------------------------------------------------
// Internal: score each captured workflow
// ---------------------------------------------------------------------------

/**
 * Produces the feedback entries for one build.
 *
 * Always emits a `workflow_produced` score (1 if at least one workflow exists,
 * otherwise 0). When workflows exist, only the last one is passed to
 * `runBinaryChecks`, and that feedback is appended.
 *
 * @param args - Workflows to score plus the context handed to the binary checks.
 * @returns Feedback entries, `workflow_produced` first.
 */
async function evaluateCapturedWorkflows(args: {
  workflows: WorkflowResponse[];
  prompt: string;
  modelId: string;
  agentTextResponse: string;
  // Typed from the context it is forwarded into, so the two cannot drift apart.
  annotations?: NonNullable<BinaryCheckContext['annotations']>;
}): Promise<Feedback[]> {
  const feedback: Feedback[] = [];

  feedback.push({
    evaluator: 'workflow-build-runner',
    metric: 'workflow_produced',
    score: args.workflows.length > 0 ? 1 : 0,
    kind: 'score',
    comment:
      args.workflows.length > 0
        ? `${String(args.workflows.length)} workflow(s) produced and round-tripped`
        : 'Agent did not produce any workflow',
  });

  if (args.workflows.length === 0) return feedback;

  const last = args.workflows[args.workflows.length - 1];

  // Optional fields are only added when truthy, so an empty agent response is
  // omitted from the context rather than passed as ''.
  const ctx: BinaryCheckContext = {
    prompt: args.prompt,
    modelId: args.modelId,
    ...(args.agentTextResponse ? { agentTextResponse: args.agentTextResponse } : {}),
    ...(args.annotations ? { annotations: args.annotations } : {}),
  };

  const { feedback: binaryFeedback } = await runBinaryChecks(last, ctx);
  feedback.push(...binaryFeedback);
  return feedback;
}