n8n/packages/@n8n/instance-ai/evaluations/subagent/runner.ts
Benjamin Schroth c961849226
feat(ai-builder): Add sub-agent evaluation harness with binary checks (no-changelog) (#28289)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 07:50:46 +00:00

179 lines
5.4 KiB
TypeScript

// ---------------------------------------------------------------------------
// HTTP-driven sub-agent runner
//
// Delegates execution to the n8n server's /rest/instance-ai/eval/run-sub-agent
// endpoint, then fetches the resulting workflows via REST and scores them
// with the existing binary-check suite.
// ---------------------------------------------------------------------------
import type {
CapturedWorkflow,
Feedback,
SubAgentResult,
SubAgentRunnerConfig,
SubAgentTestCase,
} from './types';
import { runBinaryChecks } from '../binaryChecks/index';
import type { BinaryCheckContext } from '../binaryChecks/types';
import type { N8nClient, WorkflowResponse } from '../clients/n8n-client';
/**
* Client-side model used by binary checks (they call Anthropic directly with
* ANTHROPIC_API_KEY). Independent of the server-side agent model, which the
* server resolves from its own settings when the CLI doesn't pass `--model`.
*/
const BINARY_CHECK_DEFAULT_MODEL = 'anthropic/claude-sonnet-4-20250514';
export interface RunSubAgentDeps {
client: N8nClient;
/** Delete workflows after the run (default true). Disable with --keep-workflows. */
deleteAfterRun: boolean;
}
export async function runSubAgent(
testCase: SubAgentTestCase,
config: SubAgentRunnerConfig,
deps: RunSubAgentDeps,
): Promise<SubAgentResult> {
const startMs = Date.now();
const role = testCase.subagent ?? 'builder';
const modelId = testCase.modelId ?? config.modelId;
try {
const response = await deps.client.runSubAgentEval({
role,
prompt: testCase.prompt,
...(modelId !== undefined ? { modelId } : {}),
...(testCase.maxSteps !== undefined ? { maxSteps: testCase.maxSteps } : {}),
...(config.timeoutMs !== undefined ? { timeoutMs: config.timeoutMs } : {}),
});
// Fetch each captured workflow to prove it round-trips through the real importer.
const capturedWorkflows: CapturedWorkflow[] = [];
const workflowResponses: WorkflowResponse[] = [];
for (const id of response.capturedWorkflowIds) {
try {
const wf = await deps.client.getWorkflow(id);
workflowResponses.push(wf);
capturedWorkflows.push({
json: {
name: wf.name,
nodes: wf.nodes,
connections: wf.connections,
} as CapturedWorkflow['json'],
success: true,
});
} catch (fetchError) {
const message = fetchError instanceof Error ? fetchError.message : String(fetchError);
capturedWorkflows.push({
json: { name: `fetch-failed-${id}` } as CapturedWorkflow['json'],
success: false,
errors: [`Failed to fetch workflow ${id}: ${message}`],
});
}
}
const feedback = await evaluateCapturedWorkflows({
workflows: workflowResponses,
prompt: testCase.prompt,
modelId: modelId ?? BINARY_CHECK_DEFAULT_MODEL,
agentTextResponse: response.text,
...(testCase.annotations ? { annotations: testCase.annotations } : {}),
});
// Surface the server-side run error both as feedback (so LangSmith scores
// it) and as `result.error` (so the CLI printer shows it inline). Same
// string, two consumers — intentional.
if (response.error) {
feedback.unshift({
evaluator: 'subagent-runner',
metric: 'run_error',
score: 0,
kind: 'score',
comment: response.error,
});
}
// Cleanup (best-effort — never fails the run). Run in parallel to keep
// per-case tail latency low when the agent produced several workflows.
if (deps.deleteAfterRun && response.capturedWorkflowIds.length > 0) {
await Promise.all(
response.capturedWorkflowIds.map(async (id) => {
try {
await deps.client.deleteWorkflow(id);
} catch {
// Intentionally swallow — cleanup failure is not a test failure.
}
}),
);
}
const result: SubAgentResult = {
testCase,
text: response.text,
capturedWorkflows,
feedback,
durationMs: Date.now() - startMs,
};
if (response.error) result.error = response.error;
return result;
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
return {
testCase,
text: '',
capturedWorkflows: [],
feedback: [
{
evaluator: 'subagent-runner',
metric: 'run_error',
score: 0,
kind: 'score',
comment: message,
},
],
durationMs: Date.now() - startMs,
error: message,
};
}
}
// ---------------------------------------------------------------------------
// Internal: score each captured workflow
// ---------------------------------------------------------------------------
async function evaluateCapturedWorkflows(args: {
workflows: WorkflowResponse[];
prompt: string;
modelId: string;
agentTextResponse: string;
annotations?: Record<string, unknown>;
}): Promise<Feedback[]> {
const feedback: Feedback[] = [];
feedback.push({
evaluator: 'subagent-runner',
metric: 'workflow_produced',
score: args.workflows.length > 0 ? 1 : 0,
kind: 'score',
comment:
args.workflows.length > 0
? `${String(args.workflows.length)} workflow(s) produced and round-tripped`
: 'Agent did not produce any workflow',
});
if (args.workflows.length === 0) return feedback;
const last = args.workflows[args.workflows.length - 1];
const ctx: BinaryCheckContext = {
prompt: args.prompt,
modelId: args.modelId,
...(args.agentTextResponse ? { agentTextResponse: args.agentTextResponse } : {}),
...(args.annotations ? { annotations: args.annotations } : {}),
};
const binaryFeedback = await runBinaryChecks(last, ctx);
feedback.push(...binaryFeedback);
return feedback;
}