chore: Align pairwise eval builder with production handover (#30019)

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mutasem Aldmour 2026-05-11 13:00:37 +02:00 committed by GitHub
parent 7094b48c94
commit d0367a00e8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 499 additions and 44 deletions

View File

@ -43,6 +43,13 @@ export interface BuilderRecord {
feedback: FeedbackEntry[];
tokenInput?: number;
tokenOutput?: number;
/** Number of `submit-workflow` calls during the build. IA-only; EE
* doesn't capture a tool-call timeline in a comparable shape. */
submitCalls?: number;
/** Number of tool calls that errored or returned a failed result. */
toolCallErrors?: number;
/** Total tool calls observed, used as the error-rate denominator. */
toolCallsTotal?: number;
}
interface BuilderSummary {
@ -59,6 +66,17 @@ interface BuilderSummary {
primaryPassRate: number;
avgDiagnostic: number;
avgDurationMs: number;
/** Total `submit-workflow` calls aggregated across IA records. Undefined
* for EE (which doesn't capture a comparable tool-call timeline). */
submitCallsTotal?: number;
/** Mean `submit-workflow` calls per record (IA only). */
avgSubmitCalls?: number;
/** Total tool calls observed across IA records. */
toolCallsTotal?: number;
/** Total errored tool calls observed across IA records. */
toolCallErrors?: number;
/** `toolCallErrors / toolCallsTotal` micro-averaged. IA-only. */
toolCallErrorRate?: number;
};
}
@ -71,6 +89,16 @@ interface BuilderRun {
// Instance AI loader (writes results.jsonl + workflows/<id>.json + summary.json)
// ---------------------------------------------------------------------------
interface IAToolCallTrace {
step: number;
toolCallId: string;
toolName: string;
args?: unknown;
result?: unknown;
error?: string;
elapsedMs?: number;
}
interface IAResultRecord {
exampleId: string;
iteration: number;
@ -86,6 +114,25 @@ interface IAResultRecord {
tokenUsage?: { input?: number; output?: number };
};
feedback: Array<{ metric: string; score: number; kind?: string; comment?: string }>;
toolCalls?: IAToolCallTrace[];
}
/**
* Whether a tool call should count toward the "tool error rate" metric.
* Mirrors `isErroredToolCall` in `pairwise.ts`.
*/
function isErroredIAToolCall(trace: IAToolCallTrace): boolean {
if (trace.error !== undefined) return true;
const r = trace.result;
if (r === null || r === undefined) return false;
if (typeof r === 'object' && !Array.isArray(r)) {
const obj = r as Record<string, unknown>;
if (obj.success === false) return true;
if (typeof obj.error === 'string' && obj.error.length > 0) return true;
if (Array.isArray(obj.errors) && obj.errors.length > 0) return true;
}
if (typeof r === 'string' && /\bExit code:\s*[1-9]\d*\b/.test(r)) return true;
return false;
}
interface IASummary {
@ -125,7 +172,9 @@ async function loadInstanceAiRun(dir: string): Promise<BuilderRun> {
// Use only iteration 1 for a fair 1:1 comparison.
.filter((r) => r.iteration === 1);
const normalized: BuilderRecord[] = records.map((r) => ({
const normalized: BuilderRecord[] = records.map((r) => {
const tcs = r.toolCalls ?? [];
return {
prompt: r.prompt,
exampleId: r.exampleId,
dos: r.dos,
@ -138,7 +187,11 @@ async function loadInstanceAiRun(dir: string): Promise<BuilderRun> {
feedback: r.feedback,
tokenInput: r.build.tokenUsage?.input,
tokenOutput: r.build.tokenUsage?.output,
}));
submitCalls: tcs.filter((tc) => tc.toolName === 'submit-workflow').length,
toolCallErrors: tcs.filter(isErroredIAToolCall).length,
toolCallsTotal: tcs.length,
};
});
const avgDuration =
normalized.length === 0
@ -166,6 +219,10 @@ async function loadInstanceAiRun(dir: string): Promise<BuilderRun> {
? 0
: diagnosticScores.reduce((a, b) => a + b, 0) / diagnosticScores.length;
const submitCallsTotal = normalized.reduce((s, r) => s + (r.submitCalls ?? 0), 0);
const toolCallsTotal = normalized.reduce((s, r) => s + (r.toolCallsTotal ?? 0), 0);
const toolCallErrors = normalized.reduce((s, r) => s + (r.toolCallErrors ?? 0), 0);
return {
summary: {
label: `${summary.builder} (instance-ai)`,
@ -181,6 +238,11 @@ async function loadInstanceAiRun(dir: string): Promise<BuilderRun> {
primaryPassRate,
avgDiagnostic,
avgDurationMs: avgDuration,
submitCallsTotal,
avgSubmitCalls: normalized.length ? submitCallsTotal / normalized.length : 0,
toolCallsTotal,
toolCallErrors,
toolCallErrorRate: toolCallsTotal ? toolCallErrors / toolCallsTotal : 0,
},
},
records: normalized,
@ -564,6 +626,12 @@ function renderBuilderColumn(label: string, record: BuilderRecord | undefined):
if (record.tokenInput !== undefined && record.tokenOutput !== undefined) {
metaParts.push(`<span>${record.tokenInput}+${record.tokenOutput} tok</span>`);
}
if (record.submitCalls !== undefined && record.submitCalls > 0) {
metaParts.push(`<span>submit ×${record.submitCalls}</span>`);
}
if (record.toolCallErrors !== undefined && record.toolCallErrors > 0) {
metaParts.push(`<span>err ×${record.toolCallErrors}</span>`);
}
const errorBlock = record.errorMessage
? `<div class="error">${escapeHtml(record.errorMessage)}</div>`
@ -676,6 +744,16 @@ function renderSummaryCard(
<div class="metric"><strong>${summary.totals.avgDiagnostic.toFixed(2)}</strong><span>avg diagnostic</span></div>
<div class="metric"><strong>${formatDuration(summary.totals.avgDurationMs)}</strong><span>avg build time</span></div>
<div class="metric"><strong>${summary.totals.buildSuccess}/${totalRecords}</strong><span>built ok</span></div>
${
summary.totals.toolCallErrorRate !== undefined
? `<div class="metric"><strong>${pct(summary.totals.toolCallErrorRate)}</strong><span>tool error rate (${summary.totals.toolCallErrors ?? 0}/${summary.totals.toolCallsTotal ?? 0})</span></div>`
: ''
}
${
summary.totals.avgSubmitCalls !== undefined
? `<div class="metric"><strong>${summary.totals.avgSubmitCalls.toFixed(2)}</strong><span>avg submit calls</span></div>`
: ''
}
${failureBits ? `<div class="meta-row failures">Failures: ${escapeHtml(failureBits)}</div>` : ''}
</div>`;
}
@ -686,6 +764,8 @@ function renderMetricsNote(): string {
<span><b>Primary pass</b> workflow passes only if a majority of LLM judges (2 of 3) find zero "don't" violations. Computed over all prompt attempts; build failures count as fail.</span>
<span><b>Average diagnostic</b> mean fraction of criteria (dos + don'ts) satisfied across the dataset, averaged across judges. Range 0-1; gives partial credit.</span>
<span><b>Average build time</b> averaged across all attempts including failures, so build timeouts (20-min cap) inflate this number.</span>
<span><b>Tool error rate</b> fraction of tool calls that errored or returned a failed result (e.g. <code>tsc</code> non-zero exit, <code>submit-workflow</code> rejection). Captures build-path roughness even on builds that eventually succeeded. <i>IA-only.</i></span>
<span><b>Avg submit calls</b> mean <code>submit-workflow</code> invocations per build. 1.0 = clean first-try submit. <i>IA-only.</i></span>
<span><b>Verdicts</b> compare per-prompt primary pass between the two builders.</span>
</aside>`;
}
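
For intuition, here is a minimal sketch (hypothetical counts, not taken from any run) of how the micro-averaged tool error rate reported above differs from a naive per-record average:

// Hypothetical per-record counts, for illustration only.
const perRecord = [
  { toolCallErrors: 0, toolCallsTotal: 12 }, // clean build
  { toolCallErrors: 3, toolCallsTotal: 6 }, // rough build
];
// Micro-average (what the summary reports): pool numerators and denominators.
const errors = perRecord.reduce((s, r) => s + r.toolCallErrors, 0); // 3
const total = perRecord.reduce((s, r) => s + r.toolCallsTotal, 0); // 18
const microRate = total ? errors / total : 0; // 3/18 ≈ 0.17
// A macro-average of per-record rates would give (0/12 + 3/6) / 2 = 0.25,
// over-weighting short builds; pooling the counts avoids that bias.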

View File

@ -21,6 +21,7 @@
import { ChatAnthropic } from '@langchain/anthropic';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import { Client as LangSmithClient } from 'langsmith';
import { nanoid } from 'nanoid';
import { promises as fs, readFileSync } from 'node:fs';
import path from 'node:path';
import pLimit from 'p-limit';
@ -32,7 +33,9 @@ import {
type SimpleWorkflow,
} from '../../../ai-workflow-builder.ee/evaluations/evaluators/pairwise';
import { DEFAULTS } from '../../../ai-workflow-builder.ee/evaluations/support/constants';
import { buildSubAgentBriefing } from '../../src/agent/sub-agent-briefing';
import type { Logger } from '../../src/logger';
import { DETACHED_BUILDER_REQUIREMENTS } from '../../src/tools/orchestration/build-workflow-agent.tool';
import { BuilderSandboxFactory } from '../../src/workspace/builder-sandbox-factory';
import type { SandboxConfig } from '../../src/workspace/create-workspace';
import { SnapshotManager } from '../../src/workspace/snapshot-manager';
@ -44,6 +47,13 @@ import {
import { createLogger, type EvalLogger } from '../harness/logger';
import { resolveSandboxConfig } from '../harness/sandbox-config';
/** Default dataset: orchestrator-plan-derived spec rows. Each row's prompt
* is the spec the production planner hands the builder via
* `dispatchPlannedTask`. Pair this with the production briefing wrapper
* (`DETACHED_BUILDER_REQUIREMENTS`) below to keep the eval aligned with
* what the builder sees in production. */
const DEFAULT_DATASET = 'instance-ai-builder-from-plans';
// ---------------------------------------------------------------------------
// CLI args
// ---------------------------------------------------------------------------
@ -86,7 +96,7 @@ function parseArgs(argv: string[]): PairwiseArgs {
}
return {
dataset: get('--dataset') ?? DEFAULTS.DATASET_NAME,
dataset: get('--dataset') ?? DEFAULT_DATASET,
judges: parsePositiveInt(get('--judges'), '--judges') ?? Number(DEFAULTS.NUM_JUDGES),
iterations:
parsePositiveInt(get('--iterations'), '--iterations') ?? Number(DEFAULTS.REPETITIONS),
@ -230,24 +240,6 @@ interface ExampleRecord {
feedback: Feedback[];
}
/**
* Eval-only suffix appended to every dataset prompt. Pushes the agent past
* its production "ask before assuming / set up credentials first" instinct;
* there is no human in the loop, so a clarification turn is a guaranteed
* `no_workflow_built`. Lives in the harness, not the production builder
* prompt, so production behavior is unaffected.
*
* Strictly describes the eval environment and the required terminal action
* (call `submit-workflow`). Does not name SDK helpers or otherwise lead the
* agent toward specific implementation choices; those are what the eval
* measures.
*/
const EVAL_PROMPT_SUFFIX =
'\n\n---\n' +
'You are running inside an automated, non-interactive evaluation. ' +
'There is no human to answer follow-up questions. ' +
'Do not call `ask-user` and do not ask for clarification — pick reasonable defaults and proceed.';
async function runExample(
example: DatasetExample,
iteration: number,
@ -262,8 +254,25 @@ async function runExample(
'chunks',
`${safeFilename(`${example.id}_${iteration}`)}.jsonl`,
);
// Wrap the prompt the same way the production orchestrator wraps the spec
// it hands to the builder sub-agent (see `build-workflow-agent.tool.ts`).
// Keeping this aligned with prod is what closes the eval/prod gap —
// `DETACHED_BUILDER_REQUIREMENTS` is what tells the builder it must
// `submit-workflow` then `verify-built-workflow` before stopping.
//
// `workItemId` round-trips: the briefing's `additionalContext` tells the
// agent its work-item ID, the agent passes it to `verify-built-workflow`,
// which reads back the build outcome from the in-memory
// `workflowTaskService` keyed on the same ID.
const workItemId = 'wi_' + nanoid(8);
const builderPrompt = await buildSubAgentBriefing({
task: example.prompt,
additionalContext: `[WORK ITEM ID: ${workItemId}]`,
requirements: DETACHED_BUILDER_REQUIREMENTS,
});
const build = await buildInProcess({
prompt: example.prompt + EVAL_PROMPT_SUFFIX,
prompt: builderPrompt,
workItemId,
timeoutMs: args.timeoutMs,
logPath,
sandboxFactory,
@ -336,6 +345,21 @@ interface Summary {
buildFailures: Record<string, number>;
primaryPassRate: number;
avgDiagnostic: number;
/** Total `submit-workflow` tool invocations across all records. */
submitCallsTotal: number;
/** Mean `submit-workflow` invocations per build. 1.0 = every build called
* submit exactly once; >1.0 = builds had to fix and re-submit. */
avgSubmitCalls: number;
/** (errored tool calls) / (total tool calls) micro-averaged across all
* runs. Captures how rough the build path was even on builds that
* eventually succeeded; every TypeScript compile error or failed
* domain tool call shows up here. */
toolCallErrorRate: number;
/** Total tool calls observed (used as the error-rate denominator and
* surfaced for context). */
toolCallsTotal: number;
/** Total errored tool calls observed (numerator of `toolCallErrorRate`). */
toolCallErrors: number;
};
interactivity: {
askUserCount: number;
@ -386,12 +410,17 @@ async function writeOutputs(
'durationMs',
'askUserCount',
'planToolCount',
'submitCalls',
'toolCalls',
'toolCallErrors',
'pairwisePrimary',
'pairwiseDiagnostic',
'pairwiseJudgesPassed',
].join(',');
const csvRows = records.map((r) => {
const find = (m: string) => r.feedback.find((f) => f.metric === m)?.score ?? '';
const submits = r.toolCalls.filter((tc) => tc.toolName === 'submit-workflow').length;
const errors = r.toolCalls.filter(isErroredToolCall).length;
return [
r.exampleId,
r.iteration,
@ -400,6 +429,9 @@ async function writeOutputs(
r.build.durationMs,
r.build.interactivity.askUserCount,
r.build.interactivity.planToolCount,
submits,
r.toolCalls.length,
errors,
find('pairwise_primary'),
find('pairwise_diagnostic'),
find('pairwise_judges_passed'),
@ -420,6 +452,9 @@ async function writeOutputs(
let askUserCount = 0;
let planToolCount = 0;
let autoApprovedSuspensions = 0;
let submitCallsTotal = 0;
let toolCallsTotal = 0;
let toolCallErrors = 0;
for (const record of records) {
if (record.build.success) buildSuccess++;
@ -433,6 +468,18 @@ async function writeOutputs(
allMockedCreds.add(type);
}
// `toolCalls` is the ordered timeline captured by the trace collector.
// We count any tool call that errored OR returned a failed result —
// hard Mastra tool failures are rare, but `submit-workflow` rejections
// and `execute_command` returning a non-zero `tsc` exit are common and
// dominate the "rough path" signal we care about. Suspensions are
// benign (auto-approved or surfaced via `errorClass` separately).
for (const tc of record.toolCalls) {
toolCallsTotal++;
if (isErroredToolCall(tc)) toolCallErrors++;
if (tc.toolName === 'submit-workflow') submitCallsTotal++;
}
const primary = record.feedback.find((f) => f.metric === 'pairwise_primary')?.score;
if (typeof primary === 'number') {
primaryPassSum += primary;
@ -469,6 +516,11 @@ async function writeOutputs(
buildFailures,
primaryPassRate: primaryPassCount ? primaryPassSum / primaryPassCount : 0,
avgDiagnostic: diagnosticCount ? diagnosticSum / diagnosticCount : 0,
submitCallsTotal,
avgSubmitCalls: records.length ? submitCallsTotal / records.length : 0,
toolCallsTotal,
toolCallErrors,
toolCallErrorRate: toolCallsTotal ? toolCallErrors / toolCallsTotal : 0,
},
interactivity: {
askUserCount,
@ -645,6 +697,38 @@ function safeFilename(s: string): string {
return s.replace(/[^a-zA-Z0-9._-]+/g, '_').slice(0, 120);
}
/**
* Whether a tool call should count toward the "tool error rate" metric.
*
* Catches three flavours:
* 1. **Hard Mastra failure** (`trace.error` set): tool threw / rejected.
* 2. **Tool returned a failed result object**, e.g. `submit-workflow`
* returning `{ success: false, errors: [...] }`. Looks at top-level
* `success === false` or non-empty `errors` array, plus a string
* `error` field.
* 3. **`execute_command` returned a non-zero exit code**, e.g. `tsc`
* spitting out compile errors. Looks for an `Exit code: <non-zero>`
* marker in the result text.
*/
function isErroredToolCall(trace: ToolCallTrace): boolean {
if (trace.error !== undefined) return true;
const r = trace.result;
if (r === null || r === undefined) return false;
if (typeof r === 'object' && !Array.isArray(r)) {
const obj = r as Record<string, unknown>;
if (obj.success === false) return true;
if (typeof obj.error === 'string' && obj.error.length > 0) return true;
if (Array.isArray(obj.errors) && obj.errors.length > 0) return true;
}
if (typeof r === 'string' && /\bExit code:\s*[1-9]\d*\b/.test(r)) {
return true;
}
return false;
}
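
As a sanity check, a short sketch of how the classifier labels a few hypothetical traces (objects are abbreviated via a cast; a real `ToolCallTrace` carries additional fields that are elided here):

// Abbreviated traces for illustration only.
const t = (partial: Partial<ToolCallTrace>) => partial as ToolCallTrace;
isErroredToolCall(t({ toolName: 'submit-workflow', error: 'tool threw' })); // true (flavour 1)
isErroredToolCall(t({ toolName: 'submit-workflow', result: { success: false, errors: ['bad node'] } })); // true (flavour 2)
isErroredToolCall(t({ toolName: 'execute_command', result: 'tsc output...\nExit code: 2' })); // true (flavour 3)
isErroredToolCall(t({ toolName: 'execute_command', result: 'Exit code: 0' })); // false (clean run)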
async function fileExists(filePath: string): Promise<boolean> {
try {
await fs.access(filePath);

View File

@ -34,6 +34,11 @@ interface SummaryJson {
buildFailures: Record<string, number>;
primaryPassRate: number;
avgDiagnostic: number;
submitCallsTotal?: number;
avgSubmitCalls?: number;
toolCallsTotal?: number;
toolCallErrors?: number;
toolCallErrorRate?: number;
};
interactivity: {
askUserCount: number;
@ -168,6 +173,36 @@ function escapeAttr(input: string): string {
return input.replace(/&/g, '&amp;').replace(/'/g, '&apos;').replace(/"/g, '&quot;');
}
/**
* Whether a tool call should count toward the "tool error rate" metric.
* Mirrors `isErroredToolCall` in `pairwise.ts`; kept in sync by hand
* because the report walks pre-saved `results.jsonl` files written by
* older runs of the eval too.
*/
function isErroredToolCall(trace: ToolCallTrace): boolean {
if (trace.error !== undefined) return true;
const r = trace.result;
if (r === null || r === undefined) return false;
if (typeof r === 'object' && !Array.isArray(r)) {
const obj = r as Record<string, unknown>;
if (obj.success === false) return true;
if (typeof obj.error === 'string' && obj.error.length > 0) return true;
if (Array.isArray(obj.errors) && obj.errors.length > 0) return true;
}
if (typeof r === 'string' && /\bExit code:\s*[1-9]\d*\b/.test(r)) return true;
return false;
}
function countSubmitCalls(traces: ToolCallTrace[] | undefined): number {
if (!traces) return 0;
return traces.filter((t) => t.toolName === 'submit-workflow').length;
}
function countToolCallErrors(traces: ToolCallTrace[] | undefined): number {
if (!traces) return 0;
return traces.filter(isErroredToolCall).length;
}
function findScore(feedback: FeedbackEntry[], metric: string): number | undefined {
return feedback.find((f) => f.metric === metric)?.score;
}
@ -333,6 +368,15 @@ function renderExample(record: ResultRecord, idPrefix: string): string {
if (interact.mockedCredentialTypes.length > 0)
interactBits.push(`mocked creds: ${interact.mockedCredentialTypes.join(', ')}`);
// Per-record build-path stats. Surfaced inline in the summary line so a
// reviewer can scan retries / errors without expanding each row. Numbers
// match the columns added to `results.csv`.
const submitCalls = countSubmitCalls(record.toolCalls);
const toolErrors = countToolCallErrors(record.toolCalls);
const buildStatBits: string[] = [];
if (submitCalls > 0) buildStatBits.push(`submit ×${submitCalls}`);
if (toolErrors > 0) buildStatBits.push(`err ×${toolErrors}`);
const errorBlock = record.build.errorMessage
? `<div class="error">${escapeHtml(record.build.errorMessage)}</div>`
: '';
@ -349,6 +393,7 @@ function renderExample(record: ResultRecord, idPrefix: string): string {
</div>
<span class="iteration">#${record.iteration}</span>
<span class="duration">${record.build.durationMs}ms</span>
${buildStatBits.length > 0 ? `<span class="build-stats">${buildStatBits.map(escapeHtml).join(' · ')}</span>` : ''}
<span class="badges">${renderFeedbackBadges(record.feedback)}</span>
</summary>
<div class="body">
@ -412,6 +457,16 @@ function renderRun(run: Run, index: number): string {
<span class="total ${totalFailures > 0 ? 'fail' : ''}"><strong>Build fail:</strong> ${totalFailures}${failureDetail ? ` (${escapeHtml(failureDetail)})` : ''}</span>
<span class="total"><strong>Primary pass rate:</strong> ${pct(s.totals.primaryPassRate)}</span>
<span class="total"><strong>Avg diagnostic:</strong> ${s.totals.avgDiagnostic.toFixed(2)}</span>
${
s.totals.toolCallErrorRate !== undefined
? `<span class="total ${s.totals.toolCallErrorRate > 0.1 ? 'fail' : ''}"><strong>Tool error rate:</strong> ${pct(s.totals.toolCallErrorRate)}${s.totals.toolCallErrors !== undefined && s.totals.toolCallsTotal !== undefined ? ` (${s.totals.toolCallErrors}/${s.totals.toolCallsTotal})` : ''}</span>`
: ''
}
${
s.totals.avgSubmitCalls !== undefined
? `<span class="total"><strong>Submit calls:</strong> ${s.totals.submitCallsTotal ?? 0} total, ${s.totals.avgSubmitCalls.toFixed(2)} avg/build</span>`
: ''
}
</div>
${
s.interactivity.askUserCount > 0 ||
@ -504,6 +559,7 @@ export function renderDocument(runs: Run[]): string {
details.example > summary .example-id { font-family: ui-monospace, monospace; font-size: 11px; color: var(--muted); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
details.example > summary .iteration { color: var(--muted); font-size: 11px; }
details.example > summary .duration { color: var(--muted); font-size: 11px; text-align: right; }
details.example > summary .build-stats { color: var(--muted); font-size: 11px; text-align: right; white-space: nowrap; }
details.example > summary .badges { display: flex; gap: 6px; flex-wrap: wrap; justify-content: flex-end; }
.badge { font-size: 11px; padding: 2px 6px; border-radius: 3px; background: rgba(139,148,158,0.18); color: var(--fg); }
.badge.badge-pass { background: rgba(63,185,80,0.2); color: var(--pass); }

View File

@ -40,6 +40,10 @@ import path from 'node:path';
import { normalizeWorkflow } from './normalize-workflow';
import { stringifyError, truncate } from './redact';
import { createStubServices, defaultNodesJsonPath, type StubServiceHandle } from './stub-services';
import {
createInMemoryWorkflowTaskService,
type InMemoryWorkflowTaskService,
} from './stub-workflow-task-service';
import type { SimpleWorkflow } from '../../../ai-workflow-builder.ee/evaluations/evaluators/pairwise';
import { registerWithMastra } from '../../src/agent/register-with-mastra';
import { MAX_STEPS } from '../../src/constants/max-steps';
@ -48,9 +52,15 @@ import type { Logger } from '../../src/logger';
import { executeResumableStream } from '../../src/runtime/resumable-stream-executor';
import { createAllTools } from '../../src/tools';
import { createSandboxBuilderAgentPrompt } from '../../src/tools/orchestration/build-workflow-agent.prompt';
import { createSubmitWorkflowTool } from '../../src/tools/workflows/submit-workflow.tool';
import type { ModelConfig } from '../../src/types';
import { createVerifyBuiltWorkflowTool } from '../../src/tools/orchestration/verify-built-workflow.tool';
import {
createSubmitWorkflowTool,
type SubmitWorkflowAttempt,
} from '../../src/tools/workflows/submit-workflow.tool';
import type { ModelConfig, OrchestrationContext } from '../../src/types';
import { asResumable } from '../../src/utils/stream-helpers';
import { createRemediation } from '../../src/workflow-loop/remediation';
import type { WorkflowBuildOutcome } from '../../src/workflow-loop/workflow-loop-state';
import type {
BuilderSandboxFactory,
BuilderWorkspace,
@ -130,6 +140,15 @@ export interface BuildInProcessOptions {
* `WorkflowJSON`. The workspace is destroyed on completion.
*/
sandboxFactory: BuilderSandboxFactory;
/**
* Optional pre-generated work item ID. Pass this when the caller has
* already embedded `[WORK ITEM ID: ${workItemId}]` into the prompt's
* briefing; `verify-built-workflow` reads the same value back from the
* in-memory `workflowTaskService` keyed on this ID. When omitted, a
* fresh ID is generated; in that case `verify-built-workflow` won't be
* called by the agent (the briefing didn't tell it what value to pass).
*/
workItemId?: string;
}
// ---------------------------------------------------------------------------
@ -230,6 +249,35 @@ export async function buildInProcess(
}
const prompt = createSandboxBuilderAgentPrompt(root);
// Per-build identifiers — match what production (`build-workflow-agent.tool.ts`)
// generates per orchestrator-dispatched task. The builder agent reads
// `workItemId` from the briefing's `additionalContext`, then passes it to
// `verify-built-workflow` to round-trip its build outcome.
const workItemId = options.workItemId ?? 'wi_' + nanoid(8);
const taskId = 'eval-task-' + nanoid(6);
const threadId = 'eval-thread-' + nanoid(6);
const runId = 'eval-run-' + nanoid(6);
const agentId = 'eval-builder-' + nanoid(6);
const logger = silentLogger();
// In-memory build-outcome / verification store. Lives for the duration
// of this single build; never shared. The workflowTaskService interface
// is what `verify-built-workflow` reads from after `submit-workflow`
// records the attempt below.
const workflowTaskService: InMemoryWorkflowTaskService = createInMemoryWorkflowTaskService();
// Minimal OrchestrationContext shim for `createVerifyBuiltWorkflowTool`.
// Verify-built-workflow only reads `workflowTaskService`, `domainContext`,
// `runId`, and `logger` at runtime — the rest of OrchestrationContext is
// orchestrator scaffolding the builder doesn't touch.
const verifyContext = {
threadId,
runId,
logger,
domainContext: services.context,
workflowTaskService,
} as unknown as OrchestrationContext;
const sandboxToolNames = [
'nodes',
'workflows',
@ -241,9 +289,23 @@ export async function buildInProcess(
const tool = (allTools as Record<string, unknown>)[name];
if (tool) builderTools[name] = tool;
}
builderTools['submit-workflow'] = createSubmitWorkflowTool(services.context, builderWs.workspace);
const agentId = 'eval-builder-' + nanoid(6);
// `submit-workflow` reports each attempt back via the onAttempt callback.
// Production wires this to `workflowTaskService.reportBuildOutcome` so the
// builder loop and `verify-built-workflow` can read it. We mirror that
// here so the same prompt contract works in eval.
builderTools['submit-workflow'] = createSubmitWorkflowTool(
services.context,
builderWs.workspace,
undefined,
async (attempt) => {
await workflowTaskService.reportBuildOutcome(
toWorkflowBuildOutcome(workItemId, runId, taskId, attempt),
);
},
);
builderTools['verify-built-workflow'] = createVerifyBuiltWorkflowTool(verifyContext);
const agent = new Agent({
id: agentId,
name: 'Eval Workflow Builder',
@ -266,14 +328,11 @@ export async function buildInProcess(
const abortController = new AbortController();
const timeoutHandle = setTimeout(() => abortController.abort(), timeoutMs);
const threadId = 'eval-thread-' + nanoid(6);
const runId = 'eval-run-' + nanoid(6);
const eventBus = wrapEventBusWithObserver(createInMemoryEventBus(), (event) => {
observeEvent(event, interactivity);
traceCollector.observe(event);
chunkLog?.writeEvent(event);
});
const logger = silentLogger();
let finalText: string | undefined;
try {
@ -310,6 +369,24 @@ export async function buildInProcess(
interactivity.askUserCount++;
}
},
// Match production (`consumeStreamWithHitl`): when a suspension
// auto-resumes, pass `maxSteps` and the same providerOptions to
// `resumeStream`. Without these, Mastra's `resumeStream` defaults
// to its built-in `stepCountIs(5)` cap — which silently truncates
// the agent's post-suspension work after every HITL tool. In a
// builder run that creates data tables before writing the file,
// the resume budget gets eaten by the time the agent reaches
// `submit-workflow`, and the run dies mid-flow with a stale
// `finishReason: 'suspended'`. See `consume-with-hitl.ts` for
// the production wiring.
buildResumeOptions: ({ mastraRunId, suspension }) => ({
runId: mastraRunId,
toolCallId: suspension.toolCallId,
maxSteps,
providerOptions: {
anthropic: { cacheControl: { type: 'ephemeral' as const } },
},
}),
},
});
@ -799,3 +876,62 @@ async function safeSettle<T>(value: Promise<T> | undefined): Promise<T | undefin
return undefined;
}
}
/**
* Convert a `submit-workflow` attempt into a `WorkflowBuildOutcome`.
*
* Production's `build-workflow-agent.tool.ts` does the same thing inside the
* orchestrator. We mirror it here (minus orchestrator-only fields like
* triggerType detection) so `verify-built-workflow` finds a sensible outcome
* stored against the workItemId.
*/
function toWorkflowBuildOutcome(
workItemId: string,
runId: string,
taskId: string,
attempt: SubmitWorkflowAttempt,
): WorkflowBuildOutcome {
if (!attempt.success) {
return {
workItemId,
runId,
taskId,
submitted: false,
triggerType: 'manual_or_testable',
needsUserInput: false,
failureSignature: attempt.errors?.join('; '),
remediation: attempt.remediation,
summary: attempt.errors?.join(' ') ?? 'Workflow submission failed.',
};
}
const placeholderRemediation = attempt.hasUnresolvedPlaceholders
? createRemediation({
category: 'needs_setup',
shouldEdit: false,
reason: 'mocked_credentials_or_placeholders',
guidance:
'Workflow submitted successfully, but unresolved setup values remain. Stop code edits.',
})
: undefined;
return {
workItemId,
runId,
taskId,
workflowId: attempt.workflowId,
submitted: true,
// Eval doesn't run trigger-aware verification, so the value here is
// cosmetic — the verify tool branches on `executionService.run` result,
// not this field.
triggerType: 'manual_or_testable',
needsUserInput: Boolean(placeholderRemediation),
blockingReason: placeholderRemediation?.guidance,
mockedNodeNames: attempt.mockedNodeNames,
mockedCredentialTypes: attempt.mockedCredentialTypes,
mockedCredentialsByNode: attempt.mockedCredentialsByNode,
triggerNodes: attempt.triggerNodes,
verificationPinData: attempt.verificationPinData,
hasUnresolvedPlaceholders: attempt.hasUnresolvedPlaceholders,
remediation: placeholderRemediation ?? attempt.remediation,
summary: 'Workflow submitted and ready for verification.',
};
}

View File

@ -189,8 +189,22 @@ export async function createStubServices(
async list() {
return [];
},
async run() {
return stubExecutionResult('stub: execution disabled in eval');
// `verify-built-workflow` invokes `executionService.run()` after
// `submit-workflow` has captured the TS-compiled workflow JSON. The eval
// has no execution backend, but we want the builder agent's submit →
// verify → done sequence to complete cleanly so the production briefing
// (`DETACHED_BUILDER_REQUIREMENTS`) reads coherently. Returning a
// synthetic success here lets the agent terminate after submit. The
// eval's `buildSuccess` metric is derived from `submit-workflow` capture
// — never from this synthetic verdict — so this can't inflate the score.
async run(workflowId) {
return {
executionId: 'eval-exec-' + nanoid(),
status: 'success' as const,
data: { __eval_synthetic_verify__: [{ workflowId }] },
startedAt: new Date().toISOString(),
finishedAt: new Date().toISOString(),
};
},
async getStatus() {
return stubExecutionResult('stub: execution disabled in eval');

View File

@ -0,0 +1,85 @@
// ---------------------------------------------------------------------------
// In-memory `WorkflowTaskService` stub for the in-process eval harness.
//
// Production wires `workflowTaskService` through `instance-ai.service.ts` so
// the orchestrator can persist build outcomes per `workItemId` and the
// builder sub-agent can read them back via `verify-built-workflow`. The eval
// has no persistence layer, so we mirror the production interface against an
// in-memory map. This is enough for the production builder briefing
// (`DETACHED_BUILDER_REQUIREMENTS`) to read coherently:
//
// submit-workflow → reportBuildOutcome (writes to map)
// verify-built-workflow → getBuildOutcome (reads from map) + executes
// verify result → updateBuildOutcome (writes verification record)
//
// Each `buildInProcess` call gets its own service instance — no cross-build
// state leaks.
// ---------------------------------------------------------------------------
/* eslint-disable @typescript-eslint/require-await */
// All `WorkflowTaskService` methods are interface-async even when the
// implementation is synchronous in-memory bookkeeping.
import type { WorkflowTaskService } from '../../src/types';
import type {
VerificationResult,
WorkflowBuildOutcome,
WorkflowLoopAction,
WorkflowLoopState,
} from '../../src/workflow-loop/workflow-loop-state';
export interface InMemoryWorkflowTaskService extends WorkflowTaskService {
/** Read-only access to the latest stored outcome used by callers that
* want to inspect what the agent ended up with after the run. */
peekOutcome(workItemId: string): WorkflowBuildOutcome | undefined;
/** Read-only access to the latest stored verification verdict. */
peekVerdict(workItemId: string): VerificationResult | undefined;
}
/**
* Build a fresh in-memory WorkflowTaskService.
*
* `reportBuildOutcome` and `reportVerificationVerdict` always return
* `{ type: 'ignored', reason: 'eval-mode' }` because the eval has no
* workflow-loop controller; there's no rebuild/verify state machine to
* advance. The builder agent only needs the read-back paths to work.
*/
export function createInMemoryWorkflowTaskService(): InMemoryWorkflowTaskService {
const outcomes = new Map<string, WorkflowBuildOutcome>();
const verdicts = new Map<string, VerificationResult>();
return {
async reportBuildOutcome(outcome) {
outcomes.set(outcome.workItemId, outcome);
return { type: 'ignored', reason: 'eval-mode' } satisfies WorkflowLoopAction;
},
async reportVerificationVerdict(verdict) {
verdicts.set(verdict.workItemId, verdict);
return { type: 'ignored', reason: 'eval-mode' } satisfies WorkflowLoopAction;
},
async getBuildOutcome(workItemId) {
return outcomes.get(workItemId);
},
async getWorkflowLoopState(_workItemId): Promise<WorkflowLoopState | undefined> {
// Eval has no loop controller — verify-built-workflow tolerates undefined.
return undefined;
},
async updateBuildOutcome(workItemId, update) {
const existing = outcomes.get(workItemId);
if (!existing) return;
outcomes.set(workItemId, { ...existing, ...update });
},
peekOutcome(workItemId) {
return outcomes.get(workItemId);
},
peekVerdict(workItemId) {
return verdicts.get(workItemId);
},
};
}
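
For orientation, a minimal usage sketch of the round-trip the header comment describes (the outcome object is abbreviated with a cast and its values are illustrative; in the harness it is produced by `toWorkflowBuildOutcome` in `build-in-process.ts`):

const service = createInMemoryWorkflowTaskService();
// submit-workflow's onAttempt callback writes the outcome for the work item...
await service.reportBuildOutcome({
  workItemId: 'wi_example1',
  runId: 'eval-run-1',
  taskId: 'eval-task-1',
  submitted: true,
  triggerType: 'manual_or_testable',
  needsUserInput: false,
  summary: 'Workflow submitted and ready for verification.',
} as WorkflowBuildOutcome);
// ...verify-built-workflow reads it back by the same ID...
const outcome = await service.getBuildOutcome('wi_example1');
// ...and folds its verification result into the stored outcome.
await service.updateBuildOutcome('wi_example1', { summary: 'Verified via synthetic run.' });
// After the run, the harness can inspect what the agent ended up with.
const finalOutcome = service.peekOutcome('wi_example1');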

View File

@ -304,7 +304,7 @@ async function buildOutcomeWithLatestVerification(
return await finalBuildOutcome(context, workItemId, outcome);
}
const DETACHED_BUILDER_REQUIREMENTS = `## Detached Task Contract
export const DETACHED_BUILDER_REQUIREMENTS = `## Detached Task Contract
You are running as a detached background task. Do not stop after a successful submit; verify the workflow works.