diff --git a/packages/@n8n/instance-ai/evaluations/cli/compare-pairwise.ts b/packages/@n8n/instance-ai/evaluations/cli/compare-pairwise.ts index 3cdd046d879..fafbeb99db6 100644 --- a/packages/@n8n/instance-ai/evaluations/cli/compare-pairwise.ts +++ b/packages/@n8n/instance-ai/evaluations/cli/compare-pairwise.ts @@ -43,6 +43,13 @@ export interface BuilderRecord { feedback: FeedbackEntry[]; tokenInput?: number; tokenOutput?: number; + /** Number of `submit-workflow` calls during the build. IA-only — EE + * doesn't capture a tool-call timeline in the comparable shape. */ + submitCalls?: number; + /** Number of tool calls that errored or returned a failed result. */ + toolCallErrors?: number; + /** Total tool calls observed, used as the error-rate denominator. */ + toolCallsTotal?: number; } interface BuilderSummary { @@ -59,6 +66,17 @@ interface BuilderSummary { primaryPassRate: number; avgDiagnostic: number; avgDurationMs: number; + /** Total `submit-workflow` calls aggregated across IA records. Undefined + * for EE (which doesn't capture a comparable tool-call timeline). */ + submitCallsTotal?: number; + /** Mean `submit-workflow` calls per record (IA only). */ + avgSubmitCalls?: number; + /** Total tool calls observed across IA records. */ + toolCallsTotal?: number; + /** Total errored tool calls observed across IA records. */ + toolCallErrors?: number; + /** `toolCallErrors / toolCallsTotal` micro-averaged. IA-only. */ + toolCallErrorRate?: number; }; } @@ -71,6 +89,16 @@ interface BuilderRun { // Instance AI loader (writes results.jsonl + workflows/.json + summary.json) // --------------------------------------------------------------------------- +interface IAToolCallTrace { + step: number; + toolCallId: string; + toolName: string; + args?: unknown; + result?: unknown; + error?: string; + elapsedMs?: number; +} + interface IAResultRecord { exampleId: string; iteration: number; @@ -86,6 +114,25 @@ interface IAResultRecord { tokenUsage?: { input?: number; output?: number }; }; feedback: Array<{ metric: string; score: number; kind?: string; comment?: string }>; + toolCalls?: IAToolCallTrace[]; +} + +/** + * Whether a tool call should count toward the "tool error rate" metric. + * Mirrors `isErroredToolCall` in `pairwise.ts`. + */ +function isErroredIAToolCall(trace: IAToolCallTrace): boolean { + if (trace.error !== undefined) return true; + const r = trace.result; + if (r === null || r === undefined) return false; + if (typeof r === 'object' && !Array.isArray(r)) { + const obj = r as Record; + if (obj.success === false) return true; + if (typeof obj.error === 'string' && obj.error.length > 0) return true; + if (Array.isArray(obj.errors) && obj.errors.length > 0) return true; + } + if (typeof r === 'string' && /\bExit code:\s*[1-9]\d*\b/.test(r)) return true; + return false; } interface IASummary { @@ -125,20 +172,26 @@ async function loadInstanceAiRun(dir: string): Promise { // Use only iteration 1 for a fair 1:1 comparison. .filter((r) => r.iteration === 1); - const normalized: BuilderRecord[] = records.map((r) => ({ - prompt: r.prompt, - exampleId: r.exampleId, - dos: r.dos, - donts: r.donts, - workflow: r.workflow, - durationMs: r.build.durationMs, - success: r.build.success, - errorClass: r.build.errorClass, - errorMessage: r.build.errorMessage, - feedback: r.feedback, - tokenInput: r.build.tokenUsage?.input, - tokenOutput: r.build.tokenUsage?.output, - })); + const normalized: BuilderRecord[] = records.map((r) => { + const tcs = r.toolCalls ?? 
[]; + return { + prompt: r.prompt, + exampleId: r.exampleId, + dos: r.dos, + donts: r.donts, + workflow: r.workflow, + durationMs: r.build.durationMs, + success: r.build.success, + errorClass: r.build.errorClass, + errorMessage: r.build.errorMessage, + feedback: r.feedback, + tokenInput: r.build.tokenUsage?.input, + tokenOutput: r.build.tokenUsage?.output, + submitCalls: tcs.filter((tc) => tc.toolName === 'submit-workflow').length, + toolCallErrors: tcs.filter(isErroredIAToolCall).length, + toolCallsTotal: tcs.length, + }; + }); const avgDuration = normalized.length === 0 @@ -166,6 +219,10 @@ async function loadInstanceAiRun(dir: string): Promise { ? 0 : diagnosticScores.reduce((a, b) => a + b, 0) / diagnosticScores.length; + const submitCallsTotal = normalized.reduce((s, r) => s + (r.submitCalls ?? 0), 0); + const toolCallsTotal = normalized.reduce((s, r) => s + (r.toolCallsTotal ?? 0), 0); + const toolCallErrors = normalized.reduce((s, r) => s + (r.toolCallErrors ?? 0), 0); + return { summary: { label: `${summary.builder} (instance-ai)`, @@ -181,6 +238,11 @@ async function loadInstanceAiRun(dir: string): Promise { primaryPassRate, avgDiagnostic, avgDurationMs: avgDuration, + submitCallsTotal, + avgSubmitCalls: normalized.length ? submitCallsTotal / normalized.length : 0, + toolCallsTotal, + toolCallErrors, + toolCallErrorRate: toolCallsTotal ? toolCallErrors / toolCallsTotal : 0, }, }, records: normalized, @@ -564,6 +626,12 @@ function renderBuilderColumn(label: string, record: BuilderRecord | undefined): if (record.tokenInput !== undefined && record.tokenOutput !== undefined) { metaParts.push(`${record.tokenInput}+${record.tokenOutput} tok`); } + if (record.submitCalls !== undefined && record.submitCalls > 0) { + metaParts.push(`submit ×${record.submitCalls}`); + } + if (record.toolCallErrors !== undefined && record.toolCallErrors > 0) { + metaParts.push(`err ×${record.toolCallErrors}`); + } const errorBlock = record.errorMessage ? `
${escapeHtml(record.errorMessage)}
` : '';
@@ -676,6 +744,16 @@ function renderSummaryCard(
${summary.totals.avgDiagnostic.toFixed(2)}avg diagnostic
${formatDuration(summary.totals.avgDurationMs)}avg build time
${summary.totals.buildSuccess}/${totalRecords}built ok
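The totals behind these cells are aggregated per record by both loaders. A minimal sketch of that aggregation, with a local stand-in type assumed from the `BuilderRecord` fields added in this diff; the math is the micro-average (sum numerators and denominators first, divide once):

```ts
// RecordStats is a local stand-in for the optional per-record counters.
interface RecordStats {
  submitCalls?: number;
  toolCallErrors?: number;
  toolCallsTotal?: number;
}

function aggregateBuildPath(records: RecordStats[]) {
  const submitCallsTotal = records.reduce((s, r) => s + (r.submitCalls ?? 0), 0);
  const toolCallsTotal = records.reduce((s, r) => s + (r.toolCallsTotal ?? 0), 0);
  const toolCallErrors = records.reduce((s, r) => s + (r.toolCallErrors ?? 0), 0);
  return {
    submitCallsTotal,
    avgSubmitCalls: records.length ? submitCallsTotal / records.length : 0,
    toolCallsTotal,
    toolCallErrors,
    // Micro-averaging weights busy builds more heavily: one record with
    // 50 tool calls moves the rate far more than one with 2.
    toolCallErrorRate: toolCallsTotal ? toolCallErrors / toolCallsTotal : 0,
  };
}

// e.g. one clean build plus one retry-heavy build
console.log(aggregateBuildPath([
  { submitCalls: 1, toolCallErrors: 0, toolCallsTotal: 8 },
  { submitCalls: 3, toolCallErrors: 4, toolCallsTotal: 32 },
])); // toolCallErrorRate: 0.1, avgSubmitCalls: 2
```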
+ ${ + summary.totals.toolCallErrorRate !== undefined + ? `
${pct(summary.totals.toolCallErrorRate)}tool error rate (${summary.totals.toolCallErrors ?? 0}/${summary.totals.toolCallsTotal ?? 0})
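A call counts toward this rate per the `isErroredToolCall` heuristic added in `pairwise.ts` below. A self-contained walkthrough of the three flavours it catches; the trace shape is a local stand-in and the sample payloads are illustrative, not captured from a real run:

```ts
interface TraceLike {
  toolName: string;
  result?: unknown;
  error?: string;
}

function isErrored(trace: TraceLike): boolean {
  if (trace.error !== undefined) return true; // 1. hard tool failure
  const r = trace.result;
  if (r === null || r === undefined) return false;
  if (typeof r === 'object' && !Array.isArray(r)) {
    const obj = r as Record<string, unknown>; // 2. failed result object
    if (obj.success === false) return true;
    if (typeof obj.error === 'string' && obj.error.length > 0) return true;
    if (Array.isArray(obj.errors) && obj.errors.length > 0) return true;
  }
  // 3. non-zero exit marker in command output (e.g. tsc compile errors)
  return typeof r === 'string' && /\bExit code:\s*[1-9]\d*\b/.test(r);
}

const samples: TraceLike[] = [
  { toolName: 'submit-workflow', result: { success: false, errors: ['missing trigger'] } },
  { toolName: 'execute_command', result: 'Exit code: 2\nsrc/wf.ts(3,1): error TS2304' },
  { toolName: 'execute_command', result: 'Exit code: 0' }, // clean, not counted
  { toolName: 'nodes', error: 'timeout' },
];
console.log(samples.filter(isErrored).length / samples.length); // 0.75
```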
` + : '' + } + ${ + summary.totals.avgSubmitCalls !== undefined + ? `
${summary.totals.avgSubmitCalls.toFixed(2)}avg submit calls
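Worth keeping in mind when reading this card: the mean hides the retry distribution. A hypothetical helper (not part of this PR) that buckets builds by submit attempts, so an avg of 1.40 can be read as "most builds submitted once, a few needed three tries":

```ts
function submitHistogram(submitCountPerBuild: number[]): Map<number, number> {
  const buckets = new Map<number, number>();
  for (const n of submitCountPerBuild) {
    buckets.set(n, (buckets.get(n) ?? 0) + 1);
  }
  return buckets;
}

// e.g. [1, 1, 1, 3, 1] -> Map { 1 => 4, 3 => 1 }, avg 1.4
console.log(submitHistogram([1, 1, 1, 3, 1]));
```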
` + : '' + } ${failureBits ? `
Failures: ${escapeHtml(failureBits)}
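The `failureBits` string interpolated here is assembled upstream from the summary's `buildFailures` map, an error-class-to-count record. That assembly isn't shown in this hunk; one plausible shape, under that assumption:

```ts
// Hypothetical sketch: turn { timeout: 2, no_workflow_built: 1 } into a
// compact "Failures:" line, most common error class first.
function formatFailureBits(buildFailures: Record<string, number>): string {
  return Object.entries(buildFailures)
    .sort(([, a], [, b]) => b - a)
    .map(([errorClass, count]) => `${errorClass} ×${count}`)
    .join(', ');
}

console.log(formatFailureBits({ timeout: 2, no_workflow_built: 1 }));
// "timeout ×2, no_workflow_built ×1"
```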
` : ''} `; } @@ -686,6 +764,8 @@ function renderMetricsNote(): string { Primary pass — workflow passes only if a majority of LLM judges (2 of 3) find zero "don't" violations. Computed over all prompt attempts; build failures count as fail. Average diagnostic — mean fraction of criteria (dos + don'ts) satisfied across the dataset, averaged across judges. Range 0–1; gives partial credit. Average build time — averaged across all attempts including failures, so build timeouts (20-min cap) inflate this number. + Tool error rate — fraction of tool calls that errored or returned a failed result (e.g. tsc non-zero exit, submit-workflow rejection). Captures build-path roughness even on builds that eventually succeeded. IA-only. + Avg submit calls — mean submit-workflow invocations per build. 1.0 = clean first-try submit. IA-only. Verdicts compare per-prompt primary pass between the two builders. `; } diff --git a/packages/@n8n/instance-ai/evaluations/cli/pairwise.ts b/packages/@n8n/instance-ai/evaluations/cli/pairwise.ts index a5977e51f68..3954e52a4b0 100644 --- a/packages/@n8n/instance-ai/evaluations/cli/pairwise.ts +++ b/packages/@n8n/instance-ai/evaluations/cli/pairwise.ts @@ -21,6 +21,7 @@ import { ChatAnthropic } from '@langchain/anthropic'; import type { BaseChatModel } from '@langchain/core/language_models/chat_models'; import { Client as LangSmithClient } from 'langsmith'; +import { nanoid } from 'nanoid'; import { promises as fs, readFileSync } from 'node:fs'; import path from 'node:path'; import pLimit from 'p-limit'; @@ -32,7 +33,9 @@ import { type SimpleWorkflow, } from '../../../ai-workflow-builder.ee/evaluations/evaluators/pairwise'; import { DEFAULTS } from '../../../ai-workflow-builder.ee/evaluations/support/constants'; +import { buildSubAgentBriefing } from '../../src/agent/sub-agent-briefing'; import type { Logger } from '../../src/logger'; +import { DETACHED_BUILDER_REQUIREMENTS } from '../../src/tools/orchestration/build-workflow-agent.tool'; import { BuilderSandboxFactory } from '../../src/workspace/builder-sandbox-factory'; import type { SandboxConfig } from '../../src/workspace/create-workspace'; import { SnapshotManager } from '../../src/workspace/snapshot-manager'; @@ -44,6 +47,13 @@ import { import { createLogger, type EvalLogger } from '../harness/logger'; import { resolveSandboxConfig } from '../harness/sandbox-config'; +/** Default dataset — orchestrator-plan-derived spec rows. Each row's prompt + * is the spec the production planner hands the builder via + * `dispatchPlannedTask`. Pair this with the production briefing wrapper + * (`DETACHED_BUILDER_REQUIREMENTS`) below to keep the eval aligned with + * what the builder sees in production. */ +const DEFAULT_DATASET = 'instance-ai-builder-from-plans'; + // --------------------------------------------------------------------------- // CLI args // --------------------------------------------------------------------------- @@ -86,7 +96,7 @@ function parseArgs(argv: string[]): PairwiseArgs { } return { - dataset: get('--dataset') ?? DEFAULTS.DATASET_NAME, + dataset: get('--dataset') ?? DEFAULT_DATASET, judges: parsePositiveInt(get('--judges'), '--judges') ?? Number(DEFAULTS.NUM_JUDGES), iterations: parsePositiveInt(get('--iterations'), '--iterations') ?? Number(DEFAULTS.REPETITIONS), @@ -230,24 +240,6 @@ interface ExampleRecord { feedback: Feedback[]; } -/** - * Eval-only suffix appended to every dataset prompt. 
Pushes the agent past - * its production "ask before assuming / set up credentials first" instinct - * — there is no human in the loop, so a clarification turn is a guaranteed - * `no_workflow_built`. Lives in the harness, not the production builder - * prompt, so production behavior is unaffected. - * - * Strictly describes the eval environment and the required terminal action - * (call `submit-workflow`). Does not name SDK helpers or otherwise lead the - * agent toward specific implementation choices — those are what the eval - * measures. - */ -const EVAL_PROMPT_SUFFIX = - '\n\n---\n' + - 'You are running inside an automated, non-interactive evaluation. ' + - 'There is no human to answer follow-up questions. ' + - 'Do not call `ask-user` and do not ask for clarification — pick reasonable defaults and proceed.'; - async function runExample( example: DatasetExample, iteration: number, @@ -262,8 +254,25 @@ async function runExample( 'chunks', `${safeFilename(`${example.id}_${iteration}`)}.jsonl`, ); + // Wrap the prompt the same way the production orchestrator wraps the spec + // it hands to the builder sub-agent (see `build-workflow-agent.tool.ts`). + // Keeping this aligned with prod is what closes the eval/prod gap — + // `DETACHED_BUILDER_REQUIREMENTS` is what tells the builder it must + // `submit-workflow` then `verify-built-workflow` before stopping. + // + // `workItemId` round-trips: the briefing's `additionalContext` tells the + // agent its work-item ID, the agent passes it to `verify-built-workflow`, + // which reads back the build outcome from the in-memory + // `workflowTaskService` keyed on the same ID. + const workItemId = 'wi_' + nanoid(8); + const builderPrompt = await buildSubAgentBriefing({ + task: example.prompt, + additionalContext: `[WORK ITEM ID: ${workItemId}]`, + requirements: DETACHED_BUILDER_REQUIREMENTS, + }); const build = await buildInProcess({ - prompt: example.prompt + EVAL_PROMPT_SUFFIX, + prompt: builderPrompt, + workItemId, timeoutMs: args.timeoutMs, logPath, sandboxFactory, @@ -336,6 +345,21 @@ interface Summary { buildFailures: Record; primaryPassRate: number; avgDiagnostic: number; + /** Total `submit-workflow` tool invocations across all records. */ + submitCallsTotal: number; + /** Mean `submit-workflow` invocations per build. 1.0 = every build called + * submit exactly once; >1.0 = builds had to fix and re-submit. */ + avgSubmitCalls: number; + /** (errored tool calls) / (total tool calls) micro-averaged across all + * runs. Captures how rough the build path was even on builds that + * eventually succeeded — every TypeScript compile error or failed + * domain tool call shows up here. */ + toolCallErrorRate: number; + /** Total tool calls observed (used as the error-rate denominator and + * surfaced for context). */ + toolCallsTotal: number; + /** Total errored tool calls observed (numerator of `toolCallErrorRate`). */ + toolCallErrors: number; }; interactivity: { askUserCount: number; @@ -386,12 +410,17 @@ async function writeOutputs( 'durationMs', 'askUserCount', 'planToolCount', + 'submitCalls', + 'toolCalls', + 'toolCallErrors', 'pairwisePrimary', 'pairwiseDiagnostic', 'pairwiseJudgesPassed', ].join(','); const csvRows = records.map((r) => { const find = (m: string) => r.feedback.find((f) => f.metric === m)?.score ?? 
''; + const submits = r.toolCalls.filter((tc) => tc.toolName === 'submit-workflow').length; + const errors = r.toolCalls.filter(isErroredToolCall).length; return [ r.exampleId, r.iteration, @@ -400,6 +429,9 @@ async function writeOutputs( r.build.durationMs, r.build.interactivity.askUserCount, r.build.interactivity.planToolCount, + submits, + r.toolCalls.length, + errors, find('pairwise_primary'), find('pairwise_diagnostic'), find('pairwise_judges_passed'), @@ -420,6 +452,9 @@ async function writeOutputs( let askUserCount = 0; let planToolCount = 0; let autoApprovedSuspensions = 0; + let submitCallsTotal = 0; + let toolCallsTotal = 0; + let toolCallErrors = 0; for (const record of records) { if (record.build.success) buildSuccess++; @@ -433,6 +468,18 @@ async function writeOutputs( allMockedCreds.add(type); } + // `toolCalls` is the ordered timeline captured by the trace collector. + // We count any tool call that errored OR returned a failed result — + // hard Mastra tool failures are rare, but `submit-workflow` rejections + // and `execute_command` returning a non-zero `tsc` exit are common and + // dominate the "rough path" signal we care about. Suspensions are + // benign (auto-approved or surfaced via `errorClass` separately). + for (const tc of record.toolCalls) { + toolCallsTotal++; + if (isErroredToolCall(tc)) toolCallErrors++; + if (tc.toolName === 'submit-workflow') submitCallsTotal++; + } + const primary = record.feedback.find((f) => f.metric === 'pairwise_primary')?.score; if (typeof primary === 'number') { primaryPassSum += primary; @@ -469,6 +516,11 @@ async function writeOutputs( buildFailures, primaryPassRate: primaryPassCount ? primaryPassSum / primaryPassCount : 0, avgDiagnostic: diagnosticCount ? diagnosticSum / diagnosticCount : 0, + submitCallsTotal, + avgSubmitCalls: records.length ? submitCallsTotal / records.length : 0, + toolCallsTotal, + toolCallErrors, + toolCallErrorRate: toolCallsTotal ? toolCallErrors / toolCallsTotal : 0, }, interactivity: { askUserCount, @@ -645,6 +697,38 @@ function safeFilename(s: string): string { return s.replace(/[^a-zA-Z0-9._-]+/g, '_').slice(0, 120); } +/** + * Whether a tool call should count toward the "tool error rate" metric. + * + * Catches three flavours: + * 1. **Hard Mastra failure** (`trace.error` set) — tool threw / rejected. + * 2. **Tool returned a failed result object** — e.g. `submit-workflow` + * returning `{ success: false, errors: [...] }`. Looks at top-level + * `success === false` or non-empty `errors` array, plus a string + * `error` field. + * 3. **`execute_command` returned a non-zero exit code** — e.g. `tsc` + * spitting out compile errors. Looks for an `Exit code: ` + * marker in the result text. 
+ */ +function isErroredToolCall(trace: ToolCallTrace): boolean { + if (trace.error !== undefined) return true; + const r = trace.result; + if (r === null || r === undefined) return false; + + if (typeof r === 'object' && !Array.isArray(r)) { + const obj = r as Record; + if (obj.success === false) return true; + if (typeof obj.error === 'string' && obj.error.length > 0) return true; + if (Array.isArray(obj.errors) && obj.errors.length > 0) return true; + } + + if (typeof r === 'string' && /\bExit code:\s*[1-9]\d*\b/.test(r)) { + return true; + } + + return false; +} + async function fileExists(filePath: string): Promise { try { await fs.access(filePath); diff --git a/packages/@n8n/instance-ai/evaluations/cli/report.ts b/packages/@n8n/instance-ai/evaluations/cli/report.ts index 890f83c4416..a3c657cd78a 100644 --- a/packages/@n8n/instance-ai/evaluations/cli/report.ts +++ b/packages/@n8n/instance-ai/evaluations/cli/report.ts @@ -34,6 +34,11 @@ interface SummaryJson { buildFailures: Record; primaryPassRate: number; avgDiagnostic: number; + submitCallsTotal?: number; + avgSubmitCalls?: number; + toolCallsTotal?: number; + toolCallErrors?: number; + toolCallErrorRate?: number; }; interactivity: { askUserCount: number; @@ -168,6 +173,36 @@ function escapeAttr(input: string): string { return input.replace(/&/g, '&').replace(/'/g, ''').replace(/"/g, '"'); } +/** + * Whether a tool call should count toward the "tool error rate" metric. + * Mirrors `isErroredToolCall` in `pairwise.ts` — kept in sync by hand + * because the report walks pre-saved `results.jsonl` files written by + * older runs of the eval too. + */ +function isErroredToolCall(trace: ToolCallTrace): boolean { + if (trace.error !== undefined) return true; + const r = trace.result; + if (r === null || r === undefined) return false; + if (typeof r === 'object' && !Array.isArray(r)) { + const obj = r as Record; + if (obj.success === false) return true; + if (typeof obj.error === 'string' && obj.error.length > 0) return true; + if (Array.isArray(obj.errors) && obj.errors.length > 0) return true; + } + if (typeof r === 'string' && /\bExit code:\s*[1-9]\d*\b/.test(r)) return true; + return false; +} + +function countSubmitCalls(traces: ToolCallTrace[] | undefined): number { + if (!traces) return 0; + return traces.filter((t) => t.toolName === 'submit-workflow').length; +} + +function countToolCallErrors(traces: ToolCallTrace[] | undefined): number { + if (!traces) return 0; + return traces.filter(isErroredToolCall).length; +} + function findScore(feedback: FeedbackEntry[], metric: string): number | undefined { return feedback.find((f) => f.metric === metric)?.score; } @@ -333,6 +368,15 @@ function renderExample(record: ResultRecord, idPrefix: string): string { if (interact.mockedCredentialTypes.length > 0) interactBits.push(`mocked creds: ${interact.mockedCredentialTypes.join(', ')}`); + // Per-record build-path stats. Surfaced inline in the summary line so a + // reviewer can scan retries / errors without expanding each row. Numbers + // match the columns added to `results.csv`. + const submitCalls = countSubmitCalls(record.toolCalls); + const toolErrors = countToolCallErrors(record.toolCalls); + const buildStatBits: string[] = []; + if (submitCalls > 0) buildStatBits.push(`submit ×${submitCalls}`); + if (toolErrors > 0) buildStatBits.push(`err ×${toolErrors}`); + const errorBlock = record.build.errorMessage ? `
${escapeHtml(record.build.errorMessage)}
` : ''; @@ -349,6 +393,7 @@ function renderExample(record: ResultRecord, idPrefix: string): string { #${record.iteration} ${record.build.durationMs}ms + ${buildStatBits.length > 0 ? `${buildStatBits.map(escapeHtml).join(' · ')}` : ''} ${renderFeedbackBadges(record.feedback)}
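The `buildStatBits` rendered in this summary row come from re-scanning each saved record's `toolCalls`, which older `results.jsonl` files lack entirely (hence the hand-synced copy of the heuristic in `report.ts`). A compact sketch of that back-compat handling, with local stand-ins for the report's record and trace types:

```ts
interface TraceRow { toolName: string; result?: unknown; error?: string }
interface ResultRow { toolCalls?: TraceRow[] }

function statBits(row: ResultRow, isErrored: (t: TraceRow) => boolean): string[] {
  const traces = row.toolCalls ?? []; // older runs predate the field
  const submits = traces.filter((t) => t.toolName === 'submit-workflow').length;
  const errors = traces.filter(isErrored).length;
  const bits: string[] = [];
  if (submits > 0) bits.push(`submit ×${submits}`);
  if (errors > 0) bits.push(`err ×${errors}`);
  return bits; // [] for old files: the span simply doesn't render
}
```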
@@ -412,6 +457,16 @@ function renderRun(run: Run, index: number): string { Build fail: ${totalFailures}${failureDetail ? ` (${escapeHtml(failureDetail)})` : ''} Primary pass rate: ${pct(s.totals.primaryPassRate)} Avg diagnostic: ${s.totals.avgDiagnostic.toFixed(2)} + ${ + s.totals.toolCallErrorRate !== undefined + ? `Tool error rate: ${pct(s.totals.toolCallErrorRate)}${s.totals.toolCallErrors !== undefined && s.totals.toolCallsTotal !== undefined ? ` (${s.totals.toolCallErrors}/${s.totals.toolCallsTotal})` : ''}` + : '' + } + ${ + s.totals.avgSubmitCalls !== undefined + ? `Submit calls: ${s.totals.submitCallsTotal ?? 0} total, ${s.totals.avgSubmitCalls.toFixed(2)} avg/build` + : '' + }
${ s.interactivity.askUserCount > 0 || @@ -504,6 +559,7 @@ export function renderDocument(runs: Run[]): string { details.example > summary .example-id { font-family: ui-monospace, monospace; font-size: 11px; color: var(--muted); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; } details.example > summary .iteration { color: var(--muted); font-size: 11px; } details.example > summary .duration { color: var(--muted); font-size: 11px; text-align: right; } + details.example > summary .build-stats { color: var(--muted); font-size: 11px; text-align: right; white-space: nowrap; } details.example > summary .badges { display: flex; gap: 6px; flex-wrap: wrap; justify-content: flex-end; } .badge { font-size: 11px; padding: 2px 6px; border-radius: 3px; background: rgba(139,148,158,0.18); color: var(--fg); } .badge.badge-pass { background: rgba(63,185,80,0.2); color: var(--pass); } diff --git a/packages/@n8n/instance-ai/evaluations/harness/in-process-builder.ts b/packages/@n8n/instance-ai/evaluations/harness/in-process-builder.ts index db67f8f0d06..f1ca1224c7c 100644 --- a/packages/@n8n/instance-ai/evaluations/harness/in-process-builder.ts +++ b/packages/@n8n/instance-ai/evaluations/harness/in-process-builder.ts @@ -40,6 +40,10 @@ import path from 'node:path'; import { normalizeWorkflow } from './normalize-workflow'; import { stringifyError, truncate } from './redact'; import { createStubServices, defaultNodesJsonPath, type StubServiceHandle } from './stub-services'; +import { + createInMemoryWorkflowTaskService, + type InMemoryWorkflowTaskService, +} from './stub-workflow-task-service'; import type { SimpleWorkflow } from '../../../ai-workflow-builder.ee/evaluations/evaluators/pairwise'; import { registerWithMastra } from '../../src/agent/register-with-mastra'; import { MAX_STEPS } from '../../src/constants/max-steps'; @@ -48,9 +52,15 @@ import type { Logger } from '../../src/logger'; import { executeResumableStream } from '../../src/runtime/resumable-stream-executor'; import { createAllTools } from '../../src/tools'; import { createSandboxBuilderAgentPrompt } from '../../src/tools/orchestration/build-workflow-agent.prompt'; -import { createSubmitWorkflowTool } from '../../src/tools/workflows/submit-workflow.tool'; -import type { ModelConfig } from '../../src/types'; +import { createVerifyBuiltWorkflowTool } from '../../src/tools/orchestration/verify-built-workflow.tool'; +import { + createSubmitWorkflowTool, + type SubmitWorkflowAttempt, +} from '../../src/tools/workflows/submit-workflow.tool'; +import type { ModelConfig, OrchestrationContext } from '../../src/types'; import { asResumable } from '../../src/utils/stream-helpers'; +import { createRemediation } from '../../src/workflow-loop/remediation'; +import type { WorkflowBuildOutcome } from '../../src/workflow-loop/workflow-loop-state'; import type { BuilderSandboxFactory, BuilderWorkspace, @@ -130,6 +140,15 @@ export interface BuildInProcessOptions { * `WorkflowJSON`. The workspace is destroyed on completion. */ sandboxFactory: BuilderSandboxFactory; + /** + * Optional pre-generated work item ID. Pass this when the caller has + * already embedded `[WORK ITEM ID: ${workItemId}]` into the prompt's + * briefing — `verify-built-workflow` reads the same value back from the + * in-memory `workflowTaskService` keyed on this ID. When omitted, a + * fresh ID is generated; in that case `verify-built-workflow` won't be + * called by the agent (the briefing didn't tell it what value to pass). 
+ */ + workItemId?: string; } // --------------------------------------------------------------------------- @@ -230,6 +249,35 @@ export async function buildInProcess( } const prompt = createSandboxBuilderAgentPrompt(root); + // Per-build identifiers — match what production (`build-workflow-agent.tool.ts`) + // generates per orchestrator-dispatched task. The builder agent reads + // `workItemId` from the briefing's `additionalContext`, then passes it to + // `verify-built-workflow` to round-trip its build outcome. + const workItemId = options.workItemId ?? 'wi_' + nanoid(8); + const taskId = 'eval-task-' + nanoid(6); + const threadId = 'eval-thread-' + nanoid(6); + const runId = 'eval-run-' + nanoid(6); + const agentId = 'eval-builder-' + nanoid(6); + const logger = silentLogger(); + + // In-memory build-outcome / verification store. Lives for the duration + // of this single build; never shared. The workflowTaskService interface + // is what `verify-built-workflow` reads from after `submit-workflow` + // records the attempt below. + const workflowTaskService: InMemoryWorkflowTaskService = createInMemoryWorkflowTaskService(); + + // Minimal OrchestrationContext shim for `createVerifyBuiltWorkflowTool`. + // Verify-built-workflow only reads `workflowTaskService`, `domainContext`, + // `runId`, and `logger` at runtime — the rest of OrchestrationContext is + // orchestrator scaffolding the builder doesn't touch. + const verifyContext = { + threadId, + runId, + logger, + domainContext: services.context, + workflowTaskService, + } as unknown as OrchestrationContext; + const sandboxToolNames = [ 'nodes', 'workflows', @@ -241,9 +289,23 @@ export async function buildInProcess( const tool = (allTools as Record)[name]; if (tool) builderTools[name] = tool; } - builderTools['submit-workflow'] = createSubmitWorkflowTool(services.context, builderWs.workspace); - const agentId = 'eval-builder-' + nanoid(6); + // `submit-workflow` reports each attempt back via the onAttempt callback. + // Production wires this to `workflowTaskService.reportBuildOutcome` so the + // builder loop and `verify-built-workflow` can read it. We mirror that + // here so the same prompt contract works in eval. + builderTools['submit-workflow'] = createSubmitWorkflowTool( + services.context, + builderWs.workspace, + undefined, + async (attempt) => { + await workflowTaskService.reportBuildOutcome( + toWorkflowBuildOutcome(workItemId, runId, taskId, attempt), + ); + }, + ); + builderTools['verify-built-workflow'] = createVerifyBuiltWorkflowTool(verifyContext); + const agent = new Agent({ id: agentId, name: 'Eval Workflow Builder', @@ -266,14 +328,11 @@ export async function buildInProcess( const abortController = new AbortController(); const timeoutHandle = setTimeout(() => abortController.abort(), timeoutMs); - const threadId = 'eval-thread-' + nanoid(6); - const runId = 'eval-run-' + nanoid(6); const eventBus = wrapEventBusWithObserver(createInMemoryEventBus(), (event) => { observeEvent(event, interactivity); traceCollector.observe(event); chunkLog?.writeEvent(event); }); - const logger = silentLogger(); let finalText: string | undefined; try { @@ -310,6 +369,24 @@ export async function buildInProcess( interactivity.askUserCount++; } }, + // Match production (`consumeStreamWithHitl`): when a suspension + // auto-resumes, pass `maxSteps` and the same providerOptions to + // `resumeStream`. 
Without these, Mastra's `resumeStream` defaults + // to its built-in `stepCountIs(5)` cap — which silently truncates + // the agent's post-suspension work after every HITL tool. In a + // builder run that creates data tables before writing the file, + // the resume budget gets eaten by the time the agent reaches + // `submit-workflow`, and the run dies mid-flow with a stale + // `finishReason: 'suspended'`. See `consume-with-hitl.ts` for + // the production wiring. + buildResumeOptions: ({ mastraRunId, suspension }) => ({ + runId: mastraRunId, + toolCallId: suspension.toolCallId, + maxSteps, + providerOptions: { + anthropic: { cacheControl: { type: 'ephemeral' as const } }, + }, + }), }, }); @@ -799,3 +876,62 @@ async function safeSettle(value: Promise | undefined): Promise(); + const verdicts = new Map(); + + return { + async reportBuildOutcome(outcome) { + outcomes.set(outcome.workItemId, outcome); + return { type: 'ignored', reason: 'eval-mode' } satisfies WorkflowLoopAction; + }, + + async reportVerificationVerdict(verdict) { + verdicts.set(verdict.workItemId, verdict); + return { type: 'ignored', reason: 'eval-mode' } satisfies WorkflowLoopAction; + }, + + async getBuildOutcome(workItemId) { + return outcomes.get(workItemId); + }, + + async getWorkflowLoopState(_workItemId): Promise { + // Eval has no loop controller — verify-built-workflow tolerates undefined. + return undefined; + }, + + async updateBuildOutcome(workItemId, update) { + const existing = outcomes.get(workItemId); + if (!existing) return; + outcomes.set(workItemId, { ...existing, ...update }); + }, + + peekOutcome(workItemId) { + return outcomes.get(workItemId); + }, + + peekVerdict(workItemId) { + return verdicts.get(workItemId); + }, + }; +} diff --git a/packages/@n8n/instance-ai/src/tools/orchestration/build-workflow-agent.tool.ts b/packages/@n8n/instance-ai/src/tools/orchestration/build-workflow-agent.tool.ts index fd5c649f281..0fd6ba55881 100644 --- a/packages/@n8n/instance-ai/src/tools/orchestration/build-workflow-agent.tool.ts +++ b/packages/@n8n/instance-ai/src/tools/orchestration/build-workflow-agent.tool.ts @@ -304,7 +304,7 @@ async function buildOutcomeWithLatestVerification( return await finalBuildOutcome(context, workItemId, outcome); } -const DETACHED_BUILDER_REQUIREMENTS = `## Detached Task Contract +export const DETACHED_BUILDER_REQUIREMENTS = `## Detached Task Contract You are running as a detached background task. Do not stop after a successful submit — verify the workflow works.
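For reference, the submit-then-verify round trip this harness wires up reduces to a `workItemId`-keyed store. A stripped-down sketch under that assumption; the real `WorkflowBuildOutcome` and `workflowTaskService` interfaces carry more fields than shown here:

```ts
// Minimal outcome shape assumed for illustration only.
interface BuildOutcome { workItemId: string; runId: string; success: boolean }

function createOutcomeStore() {
  const outcomes = new Map<string, BuildOutcome>();
  return {
    async reportBuildOutcome(outcome: BuildOutcome): Promise<void> {
      outcomes.set(outcome.workItemId, outcome); // submit-workflow writes...
    },
    async getBuildOutcome(workItemId: string): Promise<BuildOutcome | undefined> {
      return outcomes.get(workItemId); // ...verify-built-workflow reads back
    },
  };
}

async function demo() {
  const store = createOutcomeStore();
  const workItemId = 'wi_demo1234'; // same ID the briefing embeds for the agent
  await store.reportBuildOutcome({ workItemId, runId: 'eval-run-1', success: true });
  console.log(await store.getBuildOutcome(workItemId)); // verify sees the submit
}
void demo();
```

And a toy model of the resume-budget failure mode the comment above describes. This does not reproduce Mastra's actual `resumeStream` API; the default cap of 5 and the step counts are taken from, or illustrative of, the comment:

```ts
// Assumed default: an omitted maxSteps falls back to a 5-step cap per resume.
function stepsAvailableAfterResume(opts: { maxSteps?: number }): number {
  return opts.maxSteps ?? 5;
}

const stepsNeeded = 12; // e.g. create tables, write the file, submit, verify
console.log(stepsAvailableAfterResume({}) >= stepsNeeded);               // false: truncated mid-flow
console.log(stepsAvailableAfterResume({ maxSteps: 40 }) >= stepsNeeded); // true: budget survives HITL
```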
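Usage note: with both pieces in place, the eval's builder sees the same contract as production, namely `submit-workflow` records an outcome under the briefing's work-item ID, and `verify-built-workflow` must find it there before the run may stop.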