n8n/packages/@n8n/instance-ai/evaluations/subagent/runner.ts

190 lines
5.4 KiB
TypeScript

// ---------------------------------------------------------------------------
// Workflow-build eval runner
//
// Routes prompts through the normal Instance AI orchestrator build path and
// scores the resulting workflow with binary checks.
// ---------------------------------------------------------------------------
import type {
CapturedWorkflow,
Feedback,
WorkflowBuildEvalResult,
WorkflowBuildEvalConfig,
WorkflowBuildEvalCase,
} from './types';
import { runBinaryChecks } from '../binaryChecks/index';
import type { BinaryCheckContext } from '../binaryChecks/types';
import type { N8nClient, WorkflowResponse } from '../clients/n8n-client';
import { createLogger, type EvalLogger } from '../harness/logger';
import { buildWorkflow, cleanupBuild, type BuildResult } from '../harness/runner';
/**
 * Client-side model used by binary checks (they call Anthropic directly with
 * ANTHROPIC_API_KEY). Independent of the server-side agent model, which the
 * server resolves from its own settings when the CLI doesn't pass `--model`.
 *
 * Used by `runWorkflowBuildEval` as the fallback when neither the test case
 * nor the eval config supplies a `modelId`.
 */
const BINARY_CHECK_DEFAULT_MODEL = 'anthropic/claude-sonnet-4-20250514';
/**
 * External dependencies and shared run state handed to `runWorkflowBuildEval`.
 */
export interface RunWorkflowBuildEvalDeps {
/** n8n API client; passed to `buildWorkflow` and, during cleanup, to `cleanupBuild`. */
client: N8nClient;
/** Delete workflows after the run (default true). Disable with --keep-workflows. */
deleteAfterRun: boolean;
/**
 * Forwarded unchanged to `buildWorkflow`.
 * NOTE(review): presumably the IDs of workflows that already existed before
 * the eval started — confirm against the harness.
 */
preRunWorkflowIds: Set<string>;
/**
 * Forwarded unchanged to `buildWorkflow`.
 * NOTE(review): presumably tracks workflows already attributed to a run so
 * concurrent cases do not pick up each other's output — confirm against the harness.
 */
claimedWorkflowIds: Set<string>;
}
/**
 * Runs one workflow-build eval case end to end.
 *
 * The test case prompt is sent to `buildWorkflow` as a single user turn, the
 * resulting workflows are scored via `evaluateCapturedWorkflows`, and the
 * outcome is packaged as a `WorkflowBuildEvalResult`.
 *
 * Anything thrown while building or scoring is caught and reported as a
 * `run_error` feedback entry plus `result.error`, with no captured workflows
 * and an empty `text`.
 *
 * @param testCase - Case to run; its `modelId` (if set) takes precedence over `config.modelId`.
 * @param config - Eval configuration (`timeoutMs`, `verbose`, `modelId`).
 * @param deps - Client, cleanup flag and workflow-ID sets forwarded to the harness.
 * @returns The eval result, including `durationMs` measured from function entry.
 */
export async function runWorkflowBuildEval(
	testCase: WorkflowBuildEvalCase,
	config: WorkflowBuildEvalConfig,
	deps: RunWorkflowBuildEvalDeps,
): Promise<WorkflowBuildEvalResult> {
	const startedAt = Date.now();
	// Model for the binary checks: test case override, then config, then the default.
	const checkModelId = testCase.modelId ?? config.modelId ?? BINARY_CHECK_DEFAULT_MODEL;
	const logger = createRunnerLogger(config.verbose ?? false);

	// Single shape for the `run_error` entry used by both failure paths below.
	const toRunErrorFeedback = (comment: string): Feedback => ({
		evaluator: 'workflow-build-runner',
		metric: 'run_error',
		score: 0,
		kind: 'score',
		comment,
	});

	// Stays undefined when `buildWorkflow` itself throws, so cleanup is skipped.
	let build: BuildResult | undefined;

	try {
		build = await buildWorkflow({
			client: deps.client,
			conversation: [{ role: 'user', text: testCase.prompt }],
			timeoutMs: config.timeoutMs,
			preRunWorkflowIds: deps.preRunWorkflowIds,
			claimedWorkflowIds: deps.claimedWorkflowIds,
			logger,
			skipWorkflowChecks: true,
		});

		const capturedWorkflows = build.workflowJsons.map(toCapturedWorkflow);
		const agentTextResponse = extractAgentText(build);

		const scoredFeedback = await evaluateCapturedWorkflows({
			workflows: build.workflowJsons,
			prompt: testCase.prompt,
			modelId: checkModelId,
			agentTextResponse,
			...(testCase.annotations ? { annotations: testCase.annotations } : {}),
		});

		// The orchestrator build error is reported twice on purpose: as the first
		// feedback entry (so LangSmith scores it) and as `result.error` (so the
		// CLI printer shows it inline).
		const buildError = build.error;
		const feedback = buildError
			? [toRunErrorFeedback(buildError), ...scoredFeedback]
			: scoredFeedback;

		return {
			testCase,
			text: agentTextResponse,
			capturedWorkflows,
			feedback,
			durationMs: Date.now() - startedAt,
			...(buildError ? { error: buildError } : {}),
		};
	} catch (caught) {
		const message = caught instanceof Error ? caught.message : String(caught);

		return {
			testCase,
			text: '',
			capturedWorkflows: [],
			feedback: [toRunErrorFeedback(message)],
			durationMs: Date.now() - startedAt,
			error: message,
		};
	} finally {
		// Runs after the result above has been computed; must not `return` here,
		// or that result would be replaced.
		if (deps.deleteAfterRun && build) {
			try {
				await cleanupBuild(deps.client, build, logger);
			} catch {
				// cleanupBuild is best-effort; keep the eval result focused on build/scoring.
			}
		}
	}
}
/**
 * Reduces an API workflow response to the captured form stored on the eval
 * result: only `name`, `nodes` and `connections` are kept, and the capture is
 * always marked as successful.
 */
function toCapturedWorkflow(workflow: WorkflowResponse): CapturedWorkflow {
	const { name, nodes, connections } = workflow;
	const json = { name, nodes, connections } as CapturedWorkflow['json'];

	return { json, success: true };
}
/**
 * Collects the agent's text from every transcript turn into one string.
 *
 * Turns with empty `agentText` are dropped; the rest are joined with a blank
 * line between them. Returns `''` when the build has no transcript.
 */
function extractAgentText(build: BuildResult): string {
	const turns = build.transcript;
	if (turns == null) return '';

	const spokenTexts = turns.flatMap(({ agentText }) => (agentText.length > 0 ? [agentText] : []));
	return spokenTexts.join('\n\n');
}
/**
 * Picks the logger for a run: the real logger from `createLogger(true)` when
 * `verbose` is set, otherwise a silent logger whose methods do nothing and
 * whose `isVerbose` flag is `false`.
 */
function createRunnerLogger(verbose: boolean): EvalLogger {
	const noop = (): void => {};
	const silentLogger: EvalLogger = {
		info: noop,
		verbose: noop,
		success: noop,
		warn: noop,
		error: noop,
		isVerbose: false,
	};

	return verbose ? createLogger(true) : silentLogger;
}
// ---------------------------------------------------------------------------
// Internal: score the captured workflows (binary checks run on the last one)
// ---------------------------------------------------------------------------
/**
 * Builds the feedback list for one eval run.
 *
 * The first entry is always a `workflow_produced` score: 1 when at least one
 * workflow was captured, 0 otherwise. When workflows exist, the binary checks
 * are run against the last workflow in the list and their feedback follows.
 *
 * `agentTextResponse` and `annotations` are only added to the check context
 * when they are truthy.
 */
async function evaluateCapturedWorkflows(args: {
	workflows: WorkflowResponse[];
	prompt: string;
	modelId: string;
	agentTextResponse: string;
	annotations?: Record<string, unknown>;
}): Promise<Feedback[]> {
	const { workflows, prompt, modelId, agentTextResponse, annotations } = args;
	const workflowCount = workflows.length;
	const hasWorkflow = workflowCount > 0;

	const producedFeedback: Feedback = {
		evaluator: 'workflow-build-runner',
		metric: 'workflow_produced',
		score: hasWorkflow ? 1 : 0,
		kind: 'score',
		comment: hasWorkflow
			? `${String(workflowCount)} workflow(s) produced and round-tripped`
			: 'Agent did not produce any workflow',
	};

	// Nothing to check without a workflow.
	if (!hasWorkflow) return [producedFeedback];

	// Only the last workflow in the list goes through the binary checks.
	const finalWorkflow = workflows[workflowCount - 1];
	const checkContext: BinaryCheckContext = {
		prompt,
		modelId,
		...(agentTextResponse ? { agentTextResponse } : {}),
		...(annotations ? { annotations } : {}),
	};

	const checks = await runBinaryChecks(finalWorkflow, checkContext);
	return [producedFeedback, ...checks.feedback];
}