chore: Align pairwise eval builder with production handover (#30019)

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mutasem Aldmour 2026-05-11 13:00:37 +02:00 committed by GitHub
parent 7094b48c94
commit d0367a00e8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 499 additions and 44 deletions

View File

@ -43,6 +43,13 @@ export interface BuilderRecord {
feedback: FeedbackEntry[];
tokenInput?: number;
tokenOutput?: number;
/** Number of `submit-workflow` calls during the build. IA-only; EE
* doesn't capture a tool-call timeline in a comparable shape. */
submitCalls?: number;
/** Number of tool calls that errored or returned a failed result. */
toolCallErrors?: number;
/** Total tool calls observed, used as the error-rate denominator. */
toolCallsTotal?: number;
}
interface BuilderSummary {
@ -59,6 +66,17 @@ interface BuilderSummary {
primaryPassRate: number;
avgDiagnostic: number;
avgDurationMs: number;
/** Total `submit-workflow` calls aggregated across IA records. Undefined
* for EE (which doesn't capture a comparable tool-call timeline). */
submitCallsTotal?: number;
/** Mean `submit-workflow` calls per record (IA only). */
avgSubmitCalls?: number;
/** Total tool calls observed across IA records. */
toolCallsTotal?: number;
/** Total errored tool calls observed across IA records. */
toolCallErrors?: number;
/** `toolCallErrors / toolCallsTotal` micro-averaged. IA-only. */
toolCallErrorRate?: number;
};
}
@ -71,6 +89,16 @@ interface BuilderRun {
// Instance AI loader (writes results.jsonl + workflows/<id>.json + summary.json)
// ---------------------------------------------------------------------------
interface IAToolCallTrace {
step: number;
toolCallId: string;
toolName: string;
args?: unknown;
result?: unknown;
error?: string;
elapsedMs?: number;
}
interface IAResultRecord {
exampleId: string;
iteration: number;
@ -86,6 +114,25 @@ interface IAResultRecord {
tokenUsage?: { input?: number; output?: number };
};
feedback: Array<{ metric: string; score: number; kind?: string; comment?: string }>;
toolCalls?: IAToolCallTrace[];
}
/**
* Whether a tool call should count toward the "tool error rate" metric.
* Mirrors `isErroredToolCall` in `pairwise.ts`.
*/
function isErroredIAToolCall(trace: IAToolCallTrace): boolean {
if (trace.error !== undefined) return true;
const r = trace.result;
if (r === null || r === undefined) return false;
if (typeof r === 'object' && !Array.isArray(r)) {
const obj = r as Record<string, unknown>;
if (obj.success === false) return true;
if (typeof obj.error === 'string' && obj.error.length > 0) return true;
if (Array.isArray(obj.errors) && obj.errors.length > 0) return true;
}
if (typeof r === 'string' && /\bExit code:\s*[1-9]\d*\b/.test(r)) return true;
return false;
}
interface IASummary {
@ -125,7 +172,9 @@ async function loadInstanceAiRun(dir: string): Promise<BuilderRun> {
// Use only iteration 1 for a fair 1:1 comparison.
.filter((r) => r.iteration === 1);
const normalized: BuilderRecord[] = records.map((r) => ({
const normalized: BuilderRecord[] = records.map((r) => {
const tcs = r.toolCalls ?? [];
return {
prompt: r.prompt,
exampleId: r.exampleId,
dos: r.dos,
@ -138,7 +187,11 @@ async function loadInstanceAiRun(dir: string): Promise<BuilderRun> {
feedback: r.feedback,
tokenInput: r.build.tokenUsage?.input,
tokenOutput: r.build.tokenUsage?.output,
}));
submitCalls: tcs.filter((tc) => tc.toolName === 'submit-workflow').length,
toolCallErrors: tcs.filter(isErroredIAToolCall).length,
toolCallsTotal: tcs.length,
};
});
const avgDuration =
normalized.length === 0
@ -166,6 +219,10 @@ async function loadInstanceAiRun(dir: string): Promise<BuilderRun> {
? 0
: diagnosticScores.reduce((a, b) => a + b, 0) / diagnosticScores.length;
const submitCallsTotal = normalized.reduce((s, r) => s + (r.submitCalls ?? 0), 0);
const toolCallsTotal = normalized.reduce((s, r) => s + (r.toolCallsTotal ?? 0), 0);
const toolCallErrors = normalized.reduce((s, r) => s + (r.toolCallErrors ?? 0), 0);
return {
summary: {
label: `${summary.builder} (instance-ai)`,
@ -181,6 +238,11 @@ async function loadInstanceAiRun(dir: string): Promise<BuilderRun> {
primaryPassRate,
avgDiagnostic,
avgDurationMs: avgDuration,
submitCallsTotal,
avgSubmitCalls: normalized.length ? submitCallsTotal / normalized.length : 0,
toolCallsTotal,
toolCallErrors,
toolCallErrorRate: toolCallsTotal ? toolCallErrors / toolCallsTotal : 0,
},
},
records: normalized,
@ -564,6 +626,12 @@ function renderBuilderColumn(label: string, record: BuilderRecord | undefined):
if (record.tokenInput !== undefined && record.tokenOutput !== undefined) {
metaParts.push(`<span>${record.tokenInput}+${record.tokenOutput} tok</span>`);
}
if (record.submitCalls !== undefined && record.submitCalls > 0) {
metaParts.push(`<span>submit ×${record.submitCalls}</span>`);
}
if (record.toolCallErrors !== undefined && record.toolCallErrors > 0) {
metaParts.push(`<span>err ×${record.toolCallErrors}</span>`);
}
const errorBlock = record.errorMessage
? `<div class="error">${escapeHtml(record.errorMessage)}</div>`
@ -676,6 +744,16 @@ function renderSummaryCard(
<div class="metric"><strong>${summary.totals.avgDiagnostic.toFixed(2)}</strong><span>avg diagnostic</span></div>
<div class="metric"><strong>${formatDuration(summary.totals.avgDurationMs)}</strong><span>avg build time</span></div>
<div class="metric"><strong>${summary.totals.buildSuccess}/${totalRecords}</strong><span>built ok</span></div>
${
summary.totals.toolCallErrorRate !== undefined
? `<div class="metric"><strong>${pct(summary.totals.toolCallErrorRate)}</strong><span>tool error rate (${summary.totals.toolCallErrors ?? 0}/${summary.totals.toolCallsTotal ?? 0})</span></div>`
: ''
}
${
summary.totals.avgSubmitCalls !== undefined
? `<div class="metric"><strong>${summary.totals.avgSubmitCalls.toFixed(2)}</strong><span>avg submit calls</span></div>`
: ''
}
${failureBits ? `<div class="meta-row failures">Failures: ${escapeHtml(failureBits)}</div>` : ''}
</div>`;
}
@ -686,6 +764,8 @@ function renderMetricsNote(): string {
<span><b>Primary pass</b> workflow passes only if a majority of LLM judges (2 of 3) find zero "don't" violations. Computed over all prompt attempts; build failures count as fail.</span>
<span><b>Average diagnostic</b> mean fraction of criteria (dos + don'ts) satisfied across the dataset, averaged across judges. Range 0-1; gives partial credit.</span>
<span><b>Average build time</b> averaged across all attempts including failures, so build timeouts (20-min cap) inflate this number.</span>
<span><b>Tool error rate</b> fraction of tool calls that errored or returned a failed result (e.g. <code>tsc</code> non-zero exit, <code>submit-workflow</code> rejection). Captures build-path roughness even on builds that eventually succeeded. <i>IA-only.</i></span>
<span><b>Avg submit calls</b> mean <code>submit-workflow</code> invocations per build. 1.0 = clean first-try submit. <i>IA-only.</i></span>
<span><b>Verdicts</b> compare per-prompt primary pass between the two builders.</span>
</aside>`;
}
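
For intuition, here is a minimal sketch (hypothetical counts, not taken from any run) of how the micro-averaged tool error rate reported above differs from a naive per-record average:

// Hypothetical per-record counts, for illustration only.
const perRecord = [
  { toolCallErrors: 0, toolCallsTotal: 12 }, // clean build
  { toolCallErrors: 3, toolCallsTotal: 6 }, // rough build
];
// Micro-average (what the summary reports): pool numerators and denominators.
const errors = perRecord.reduce((s, r) => s + r.toolCallErrors, 0); // 3
const total = perRecord.reduce((s, r) => s + r.toolCallsTotal, 0); // 18
const microRate = total ? errors / total : 0; // 3/18 ≈ 0.17
// A macro-average of per-record rates would give (0/12 + 3/6) / 2 = 0.25,
// over-weighting short builds; pooling the counts avoids that bias.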

View File

@ -21,6 +21,7 @@
import { ChatAnthropic } from '@langchain/anthropic';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import { Client as LangSmithClient } from 'langsmith';
import { nanoid } from 'nanoid';
import { promises as fs, readFileSync } from 'node:fs';
import path from 'node:path';
import pLimit from 'p-limit';
@ -32,7 +33,9 @@ import {
type SimpleWorkflow,
} from '../../../ai-workflow-builder.ee/evaluations/evaluators/pairwise';
import { DEFAULTS } from '../../../ai-workflow-builder.ee/evaluations/support/constants';
import { buildSubAgentBriefing } from '../../src/agent/sub-agent-briefing';
import type { Logger } from '../../src/logger';
import { DETACHED_BUILDER_REQUIREMENTS } from '../../src/tools/orchestration/build-workflow-agent.tool';
import { BuilderSandboxFactory } from '../../src/workspace/builder-sandbox-factory';
import type { SandboxConfig } from '../../src/workspace/create-workspace';
import { SnapshotManager } from '../../src/workspace/snapshot-manager';
@ -44,6 +47,13 @@ import {
import { createLogger, type EvalLogger } from '../harness/logger';
import { resolveSandboxConfig } from '../harness/sandbox-config';
/** Default dataset: orchestrator-plan-derived spec rows. Each row's prompt
* is the spec the production planner hands the builder via
* `dispatchPlannedTask`. Pair this with the production briefing wrapper
* (`DETACHED_BUILDER_REQUIREMENTS`) below to keep the eval aligned with
* what the builder sees in production. */
const DEFAULT_DATASET = 'instance-ai-builder-from-plans';
// ---------------------------------------------------------------------------
// CLI args
// ---------------------------------------------------------------------------
@ -86,7 +96,7 @@ function parseArgs(argv: string[]): PairwiseArgs {
}
return {
dataset: get('--dataset') ?? DEFAULTS.DATASET_NAME,
dataset: get('--dataset') ?? DEFAULT_DATASET,
judges: parsePositiveInt(get('--judges'), '--judges') ?? Number(DEFAULTS.NUM_JUDGES),
iterations:
parsePositiveInt(get('--iterations'), '--iterations') ?? Number(DEFAULTS.REPETITIONS),
@ -230,24 +240,6 @@ interface ExampleRecord {
feedback: Feedback[];
}
/**
* Eval-only suffix appended to every dataset prompt. Pushes the agent past
* its production "ask before assuming / set up credentials first" instinct;
* there is no human in the loop, so a clarification turn is a guaranteed
* `no_workflow_built`. Lives in the harness, not the production builder
* prompt, so production behavior is unaffected.
*
* Strictly describes the eval environment and the required terminal action
* (call `submit-workflow`). Does not name SDK helpers or otherwise lead the
* agent toward specific implementation choices; those are what the eval
* measures.
*/
const EVAL_PROMPT_SUFFIX =
'\n\n---\n' +
'You are running inside an automated, non-interactive evaluation. ' +
'There is no human to answer follow-up questions. ' +
'Do not call `ask-user` and do not ask for clarification — pick reasonable defaults and proceed.';
async function runExample(
example: DatasetExample,
iteration: number,
@ -262,8 +254,25 @@ async function runExample(
'chunks',
`${safeFilename(`${example.id}_${iteration}`)}.jsonl`,
);
// Wrap the prompt the same way the production orchestrator wraps the spec
// it hands to the builder sub-agent (see `build-workflow-agent.tool.ts`).
// Keeping this aligned with prod is what closes the eval/prod gap —
// `DETACHED_BUILDER_REQUIREMENTS` is what tells the builder it must
// `submit-workflow` then `verify-built-workflow` before stopping.
//
// `workItemId` round-trips: the briefing's `additionalContext` tells the
// agent its work-item ID, the agent passes it to `verify-built-workflow`,
// which reads back the build outcome from the in-memory
// `workflowTaskService` keyed on the same ID.
const workItemId = 'wi_' + nanoid(8);
const builderPrompt = await buildSubAgentBriefing({
task: example.prompt,
additionalContext: `[WORK ITEM ID: ${workItemId}]`,
requirements: DETACHED_BUILDER_REQUIREMENTS,
});
const build = await buildInProcess({
prompt: example.prompt + EVAL_PROMPT_SUFFIX,
prompt: builderPrompt,
workItemId,
timeoutMs: args.timeoutMs,
logPath,
sandboxFactory,
@ -336,6 +345,21 @@ interface Summary {
buildFailures: Record<string, number>;
primaryPassRate: number;
avgDiagnostic: number;
/** Total `submit-workflow` tool invocations across all records. */
submitCallsTotal: number;
/** Mean `submit-workflow` invocations per build. 1.0 = every build called
* submit exactly once; >1.0 = builds had to fix and re-submit. */
avgSubmitCalls: number;
/** (errored tool calls) / (total tool calls) micro-averaged across all
* runs. Captures how rough the build path was even on builds that
* eventually succeeded; every TypeScript compile error or failed
* domain tool call shows up here. */
toolCallErrorRate: number;
/** Total tool calls observed (used as the error-rate denominator and
* surfaced for context). */
toolCallsTotal: number;
/** Total errored tool calls observed (numerator of `toolCallErrorRate`). */
toolCallErrors: number;
};
interactivity: {
askUserCount: number;
@ -386,12 +410,17 @@ async function writeOutputs(
'durationMs',
'askUserCount',
'planToolCount',
'submitCalls',
'toolCalls',
'toolCallErrors',
'pairwisePrimary',
'pairwiseDiagnostic',
'pairwiseJudgesPassed',
].join(',');
const csvRows = records.map((r) => {
const find = (m: string) => r.feedback.find((f) => f.metric === m)?.score ?? '';
const submits = r.toolCalls.filter((tc) => tc.toolName === 'submit-workflow').length;
const errors = r.toolCalls.filter(isErroredToolCall).length;
return [
r.exampleId,
r.iteration,
@ -400,6 +429,9 @@ async function writeOutputs(
r.build.durationMs,
r.build.interactivity.askUserCount,
r.build.interactivity.planToolCount,
submits,
r.toolCalls.length,
errors,
find('pairwise_primary'),
find('pairwise_diagnostic'),
find('pairwise_judges_passed'),
@ -420,6 +452,9 @@ async function writeOutputs(
let askUserCount = 0;
let planToolCount = 0;
let autoApprovedSuspensions = 0;
let submitCallsTotal = 0;
let toolCallsTotal = 0;
let toolCallErrors = 0;
for (const record of records) {
if (record.build.success) buildSuccess++;
@ -433,6 +468,18 @@ async function writeOutputs(
allMockedCreds.add(type);
}
// `toolCalls` is the ordered timeline captured by the trace collector.
// We count any tool call that errored OR returned a failed result —
// hard Mastra tool failures are rare, but `submit-workflow` rejections
// and `execute_command` returning a non-zero `tsc` exit are common and
// dominate the "rough path" signal we care about. Suspensions are
// benign (auto-approved or surfaced via `errorClass` separately).
for (const tc of record.toolCalls) {
toolCallsTotal++;
if (isErroredToolCall(tc)) toolCallErrors++;
if (tc.toolName === 'submit-workflow') submitCallsTotal++;
}
const primary = record.feedback.find((f) => f.metric === 'pairwise_primary')?.score;
if (typeof primary === 'number') {
primaryPassSum += primary;
@ -469,6 +516,11 @@ async function writeOutputs(
buildFailures,
primaryPassRate: primaryPassCount ? primaryPassSum / primaryPassCount : 0,
avgDiagnostic: diagnosticCount ? diagnosticSum / diagnosticCount : 0,
submitCallsTotal,
avgSubmitCalls: records.length ? submitCallsTotal / records.length : 0,
toolCallsTotal,
toolCallErrors,
toolCallErrorRate: toolCallsTotal ? toolCallErrors / toolCallsTotal : 0,
},
interactivity: {
askUserCount,
@ -645,6 +697,38 @@ function safeFilename(s: string): string {
return s.replace(/[^a-zA-Z0-9._-]+/g, '_').slice(0, 120);
}
/**
* Whether a tool call should count toward the "tool error rate" metric.
*
* Catches three flavours:
* 1. **Hard Mastra failure** (`trace.error` set): tool threw / rejected.
* 2. **Tool returned a failed result object**, e.g. `submit-workflow`
* returning `{ success: false, errors: [...] }`. Looks at top-level
* `success === false` or non-empty `errors` array, plus a string
* `error` field.
* 3. **`execute_command` returned a non-zero exit code**, e.g. `tsc`
* spitting out compile errors. Looks for an `Exit code: <non-zero>`
* marker in the result text.
*/
function isErroredToolCall(trace: ToolCallTrace): boolean {
if (trace.error !== undefined) return true;
const r = trace.result;
if (r === null || r === undefined) return false;
if (typeof r === 'object' && !Array.isArray(r)) {
const obj = r as Record<string, unknown>;
if (obj.success === false) return true;
if (typeof obj.error === 'string' && obj.error.length > 0) return true;
if (Array.isArray(obj.errors) && obj.errors.length > 0) return true;
}
if (typeof r === 'string' && /\bExit code:\s*[1-9]\d*\b/.test(r)) {
return true;
}
return false;
}
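
As a sanity check, a short sketch of how the classifier labels a few hypothetical traces (objects are abbreviated via a cast; a real `ToolCallTrace` carries additional fields that are elided here):

// Abbreviated traces for illustration only.
const t = (partial: Partial<ToolCallTrace>) => partial as ToolCallTrace;
isErroredToolCall(t({ toolName: 'submit-workflow', error: 'tool threw' })); // true (flavour 1)
isErroredToolCall(t({ toolName: 'submit-workflow', result: { success: false, errors: ['bad node'] } })); // true (flavour 2)
isErroredToolCall(t({ toolName: 'execute_command', result: 'tsc output...\nExit code: 2' })); // true (flavour 3)
isErroredToolCall(t({ toolName: 'execute_command', result: 'Exit code: 0' })); // false (clean run)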
async function fileExists(filePath: string): Promise<boolean> {
try {
await fs.access(filePath);

View File

@ -34,6 +34,11 @@ interface SummaryJson {
buildFailures: Record<string, number>;
primaryPassRate: number;
avgDiagnostic: number;
submitCallsTotal?: number;
avgSubmitCalls?: number;
toolCallsTotal?: number;
toolCallErrors?: number;
toolCallErrorRate?: number;
};
interactivity: {
askUserCount: number;
@ -168,6 +173,36 @@ function escapeAttr(input: string): string {
return input.replace(/&/g, '&amp;').replace(/'/g, '&apos;').replace(/"/g, '&quot;');
}
/**
* Whether a tool call should count toward the "tool error rate" metric.
* Mirrors `isErroredToolCall` in `pairwise.ts`; kept in sync by hand
* because the report walks pre-saved `results.jsonl` files written by
* older runs of the eval too.
*/
function isErroredToolCall(trace: ToolCallTrace): boolean {
if (trace.error !== undefined) return true;
const r = trace.result;
if (r === null || r === undefined) return false;
if (typeof r === 'object' && !Array.isArray(r)) {
const obj = r as Record<string, unknown>;
if (obj.success === false) return true;
if (typeof obj.error === 'string' && obj.error.length > 0) return true;
if (Array.isArray(obj.errors) && obj.errors.length > 0) return true;
}
if (typeof r === 'string' && /\bExit code:\s*[1-9]\d*\b/.test(r)) return true;
return false;
}
function countSubmitCalls(traces: ToolCallTrace[] | undefined): number {
if (!traces) return 0;
return traces.filter((t) => t.toolName === 'submit-workflow').length;
}
function countToolCallErrors(traces: ToolCallTrace[] | undefined): number {
if (!traces) return 0;
return traces.filter(isErroredToolCall).length;
}
function findScore(feedback: FeedbackEntry[], metric: string): number | undefined {
return feedback.find((f) => f.metric === metric)?.score;
}
@ -333,6 +368,15 @@ function renderExample(record: ResultRecord, idPrefix: string): string {
if (interact.mockedCredentialTypes.length > 0)
interactBits.push(`mocked creds: ${interact.mockedCredentialTypes.join(', ')}`);
// Per-record build-path stats. Surfaced inline in the summary line so a
// reviewer can scan retries / errors without expanding each row. Numbers
// match the columns added to `results.csv`.
const submitCalls = countSubmitCalls(record.toolCalls);
const toolErrors = countToolCallErrors(record.toolCalls);
const buildStatBits: string[] = [];
if (submitCalls > 0) buildStatBits.push(`submit ×${submitCalls}`);
if (toolErrors > 0) buildStatBits.push(`err ×${toolErrors}`);
const errorBlock = record.build.errorMessage
? `<div class="error">${escapeHtml(record.build.errorMessage)}</div>`
: '';
@ -349,6 +393,7 @@ function renderExample(record: ResultRecord, idPrefix: string): string {
</div>
<span class="iteration">#${record.iteration}</span>
<span class="duration">${record.build.durationMs}ms</span>
${buildStatBits.length > 0 ? `<span class="build-stats">${buildStatBits.map(escapeHtml).join(' · ')}</span>` : ''}
<span class="badges">${renderFeedbackBadges(record.feedback)}</span>
</summary>
<div class="body">
@ -412,6 +457,16 @@ function renderRun(run: Run, index: number): string {
<span class="total ${totalFailures > 0 ? 'fail' : ''}"><strong>Build fail:</strong> ${totalFailures}${failureDetail ? ` (${escapeHtml(failureDetail)})` : ''}</span>
<span class="total"><strong>Primary pass rate:</strong> ${pct(s.totals.primaryPassRate)}</span>
<span class="total"><strong>Avg diagnostic:</strong> ${s.totals.avgDiagnostic.toFixed(2)}</span>
${
s.totals.toolCallErrorRate !== undefined
? `<span class="total ${s.totals.toolCallErrorRate > 0.1 ? 'fail' : ''}"><strong>Tool error rate:</strong> ${pct(s.totals.toolCallErrorRate)}${s.totals.toolCallErrors !== undefined && s.totals.toolCallsTotal !== undefined ? ` (${s.totals.toolCallErrors}/${s.totals.toolCallsTotal})` : ''}</span>`
: ''
}
${
s.totals.avgSubmitCalls !== undefined
? `<span class="total"><strong>Submit calls:</strong> ${s.totals.submitCallsTotal ?? 0} total, ${s.totals.avgSubmitCalls.toFixed(2)} avg/build</span>`
: ''
}
</div>
${
s.interactivity.askUserCount > 0 ||
@ -504,6 +559,7 @@ export function renderDocument(runs: Run[]): string {
details.example > summary .example-id { font-family: ui-monospace, monospace; font-size: 11px; color: var(--muted); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
details.example > summary .iteration { color: var(--muted); font-size: 11px; }
details.example > summary .duration { color: var(--muted); font-size: 11px; text-align: right; }
details.example > summary .build-stats { color: var(--muted); font-size: 11px; text-align: right; white-space: nowrap; }
details.example > summary .badges { display: flex; gap: 6px; flex-wrap: wrap; justify-content: flex-end; }
.badge { font-size: 11px; padding: 2px 6px; border-radius: 3px; background: rgba(139,148,158,0.18); color: var(--fg); }
.badge.badge-pass { background: rgba(63,185,80,0.2); color: var(--pass); }

View File

@ -40,6 +40,10 @@ import path from 'node:path';
import { normalizeWorkflow } from './normalize-workflow';
import { stringifyError, truncate } from './redact';
import { createStubServices, defaultNodesJsonPath, type StubServiceHandle } from './stub-services';
import {
createInMemoryWorkflowTaskService,
type InMemoryWorkflowTaskService,
} from './stub-workflow-task-service';
import type { SimpleWorkflow } from '../../../ai-workflow-builder.ee/evaluations/evaluators/pairwise';
import { registerWithMastra } from '../../src/agent/register-with-mastra';
import { MAX_STEPS } from '../../src/constants/max-steps';
@ -48,9 +52,15 @@ import type { Logger } from '../../src/logger';
import { executeResumableStream } from '../../src/runtime/resumable-stream-executor';
import { createAllTools } from '../../src/tools';
import { createSandboxBuilderAgentPrompt } from '../../src/tools/orchestration/build-workflow-agent.prompt';
import { createSubmitWorkflowTool } from '../../src/tools/workflows/submit-workflow.tool';
import type { ModelConfig } from '../../src/types';
import { createVerifyBuiltWorkflowTool } from '../../src/tools/orchestration/verify-built-workflow.tool';
import {
createSubmitWorkflowTool,
type SubmitWorkflowAttempt,
} from '../../src/tools/workflows/submit-workflow.tool';
import type { ModelConfig, OrchestrationContext } from '../../src/types';
import { asResumable } from '../../src/utils/stream-helpers';
import { createRemediation } from '../../src/workflow-loop/remediation';
import type { WorkflowBuildOutcome } from '../../src/workflow-loop/workflow-loop-state';
import type {
BuilderSandboxFactory,
BuilderWorkspace,
@ -130,6 +140,15 @@ export interface BuildInProcessOptions {
* `WorkflowJSON`. The workspace is destroyed on completion.
*/
sandboxFactory: BuilderSandboxFactory;
/**
* Optional pre-generated work item ID. Pass this when the caller has
* already embedded `[WORK ITEM ID: ${workItemId}]` into the prompt's
* briefing; `verify-built-workflow` reads the same value back from the
* in-memory `workflowTaskService` keyed on this ID. When omitted, a
* fresh ID is generated; in that case `verify-built-workflow` won't be
* called by the agent (the briefing didn't tell it what value to pass).
*/
workItemId?: string;
}
// ---------------------------------------------------------------------------
@ -230,6 +249,35 @@ export async function buildInProcess(
}
const prompt = createSandboxBuilderAgentPrompt(root);
// Per-build identifiers — match what production (`build-workflow-agent.tool.ts`)
// generates per orchestrator-dispatched task. The builder agent reads
// `workItemId` from the briefing's `additionalContext`, then passes it to
// `verify-built-workflow` to round-trip its build outcome.
const workItemId = options.workItemId ?? 'wi_' + nanoid(8);
const taskId = 'eval-task-' + nanoid(6);
const threadId = 'eval-thread-' + nanoid(6);
const runId = 'eval-run-' + nanoid(6);
const agentId = 'eval-builder-' + nanoid(6);
const logger = silentLogger();
// In-memory build-outcome / verification store. Lives for the duration
// of this single build; never shared. The workflowTaskService interface
// is what `verify-built-workflow` reads from after `submit-workflow`
// records the attempt below.
const workflowTaskService: InMemoryWorkflowTaskService = createInMemoryWorkflowTaskService();
// Minimal OrchestrationContext shim for `createVerifyBuiltWorkflowTool`.
// Verify-built-workflow only reads `workflowTaskService`, `domainContext`,
// `runId`, and `logger` at runtime — the rest of OrchestrationContext is
// orchestrator scaffolding the builder doesn't touch.
const verifyContext = {
threadId,
runId,
logger,
domainContext: services.context,
workflowTaskService,
} as unknown as OrchestrationContext;
const sandboxToolNames = [
'nodes',
'workflows',
@ -241,9 +289,23 @@ export async function buildInProcess(
const tool = (allTools as Record<string, unknown>)[name];
if (tool) builderTools[name] = tool;
}
builderTools['submit-workflow'] = createSubmitWorkflowTool(services.context, builderWs.workspace);
const agentId = 'eval-builder-' + nanoid(6);
// `submit-workflow` reports each attempt back via the onAttempt callback.
// Production wires this to `workflowTaskService.reportBuildOutcome` so the
// builder loop and `verify-built-workflow` can read it. We mirror that
// here so the same prompt contract works in eval.
builderTools['submit-workflow'] = createSubmitWorkflowTool(
services.context,
builderWs.workspace,
undefined,
async (attempt) => {
await workflowTaskService.reportBuildOutcome(
toWorkflowBuildOutcome(workItemId, runId, taskId, attempt),
);
},
);
builderTools['verify-built-workflow'] = createVerifyBuiltWorkflowTool(verifyContext);
const agent = new Agent({
id: agentId,
name: 'Eval Workflow Builder',
@ -266,14 +328,11 @@ export async function buildInProcess(
const abortController = new AbortController();
const timeoutHandle = setTimeout(() => abortController.abort(), timeoutMs);
const threadId = 'eval-thread-' + nanoid(6);
const runId = 'eval-run-' + nanoid(6);
const eventBus = wrapEventBusWithObserver(createInMemoryEventBus(), (event) => {
observeEvent(event, interactivity);
traceCollector.observe(event);
chunkLog?.writeEvent(event);
});
const logger = silentLogger();
let finalText: string | undefined;
try {
@ -310,6 +369,24 @@ export async function buildInProcess(
interactivity.askUserCount++;
}
},
// Match production (`consumeStreamWithHitl`): when a suspension
// auto-resumes, pass `maxSteps` and the same providerOptions to
// `resumeStream`. Without these, Mastra's `resumeStream` defaults
// to its built-in `stepCountIs(5)` cap — which silently truncates
// the agent's post-suspension work after every HITL tool. In a
// builder run that creates data tables before writing the file,
// the resume budget gets eaten by the time the agent reaches
// `submit-workflow`, and the run dies mid-flow with a stale
// `finishReason: 'suspended'`. See `consume-with-hitl.ts` for
// the production wiring.
buildResumeOptions: ({ mastraRunId, suspension }) => ({
runId: mastraRunId,
toolCallId: suspension.toolCallId,
maxSteps,
providerOptions: {
anthropic: { cacheControl: { type: 'ephemeral' as const } },
},
}),
},
});
@ -799,3 +876,62 @@ async function safeSettle<T>(value: Promise<T> | undefined): Promise<T | undefin
return undefined;
}
}
/**
* Convert a `submit-workflow` attempt into a `WorkflowBuildOutcome`.
*
* Production's `build-workflow-agent.tool.ts` does the same thing inside the
* orchestrator. We mirror it here (minus orchestrator-only fields like
* triggerType detection) so `verify-built-workflow` finds a sensible outcome
* stored against the workItemId.
*/
function toWorkflowBuildOutcome(
workItemId: string,
runId: string,
taskId: string,
attempt: SubmitWorkflowAttempt,
): WorkflowBuildOutcome {
if (!attempt.success) {
return {
workItemId,
runId,
taskId,
submitted: false,
triggerType: 'manual_or_testable',
needsUserInput: false,
failureSignature: attempt.errors?.join('; '),
remediation: attempt.remediation,
summary: attempt.errors?.join(' ') ?? 'Workflow submission failed.',
};
}
const placeholderRemediation = attempt.hasUnresolvedPlaceholders
? createRemediation({
category: 'needs_setup',
shouldEdit: false,
reason: 'mocked_credentials_or_placeholders',
guidance:
'Workflow submitted successfully, but unresolved setup values remain. Stop code edits.',
})
: undefined;
return {
workItemId,
runId,
taskId,
workflowId: attempt.workflowId,
submitted: true,
// Eval doesn't run trigger-aware verification, so the value here is
// cosmetic — the verify tool branches on `executionService.run` result,
// not this field.
triggerType: 'manual_or_testable',
needsUserInput: Boolean(placeholderRemediation),
blockingReason: placeholderRemediation?.guidance,
mockedNodeNames: attempt.mockedNodeNames,
mockedCredentialTypes: attempt.mockedCredentialTypes,
mockedCredentialsByNode: attempt.mockedCredentialsByNode,
triggerNodes: attempt.triggerNodes,
verificationPinData: attempt.verificationPinData,
hasUnresolvedPlaceholders: attempt.hasUnresolvedPlaceholders,
remediation: placeholderRemediation ?? attempt.remediation,
summary: 'Workflow submitted and ready for verification.',
};
}

View File

@ -189,8 +189,22 @@ export async function createStubServices(
async list() {
return [];
},
async run() {
return stubExecutionResult('stub: execution disabled in eval');
// `verify-built-workflow` invokes `executionService.run()` after
// `submit-workflow` has captured the TS-compiled workflow JSON. The eval
// has no execution backend, but we want the builder agent's submit →
// verify → done sequence to complete cleanly so the production briefing
// (`DETACHED_BUILDER_REQUIREMENTS`) reads coherently. Returning a
// synthetic success here lets the agent terminate after submit. The
// eval's `buildSuccess` metric is derived from `submit-workflow` capture
// — never from this synthetic verdict — so this can't inflate the score.
async run(workflowId) {
return {
executionId: 'eval-exec-' + nanoid(),
status: 'success' as const,
data: { __eval_synthetic_verify__: [{ workflowId }] },
startedAt: new Date().toISOString(),
finishedAt: new Date().toISOString(),
};
},
async getStatus() {
return stubExecutionResult('stub: execution disabled in eval');

View File

@ -0,0 +1,85 @@
// ---------------------------------------------------------------------------
// In-memory `WorkflowTaskService` stub for the in-process eval harness.
//
// Production wires `workflowTaskService` through `instance-ai.service.ts` so
// the orchestrator can persist build outcomes per `workItemId` and the
// builder sub-agent can read them back via `verify-built-workflow`. The eval
// has no persistence layer, so we mirror the production interface against an
// in-memory map. This is enough for the production builder briefing
// (`DETACHED_BUILDER_REQUIREMENTS`) to read coherently:
//
// submit-workflow → reportBuildOutcome (writes to map)
// verify-built-workflow → getBuildOutcome (reads from map) + executes
// verify result → updateBuildOutcome (writes verification record)
//
// Each `buildInProcess` call gets its own service instance — no cross-build
// state leaks.
// ---------------------------------------------------------------------------
/* eslint-disable @typescript-eslint/require-await */
// All `WorkflowTaskService` methods are interface-async even when the
// implementation is synchronous in-memory bookkeeping.
import type { WorkflowTaskService } from '../../src/types';
import type {
VerificationResult,
WorkflowBuildOutcome,
WorkflowLoopAction,
WorkflowLoopState,
} from '../../src/workflow-loop/workflow-loop-state';
export interface InMemoryWorkflowTaskService extends WorkflowTaskService {
/** Read-only access to the latest stored outcome used by callers that
* want to inspect what the agent ended up with after the run. */
peekOutcome(workItemId: string): WorkflowBuildOutcome | undefined;
/** Read-only access to the latest stored verification verdict. */
peekVerdict(workItemId: string): VerificationResult | undefined;
}
/**
* Build a fresh in-memory WorkflowTaskService.
*
* `reportBuildOutcome` and `reportVerificationVerdict` always return
* `{ type: 'ignored', reason: 'eval-mode' }` because the eval has no
* workflow-loop controller; there's no rebuild/verify state machine to
* advance. The builder agent only needs the read-back paths to work.
*/
export function createInMemoryWorkflowTaskService(): InMemoryWorkflowTaskService {
const outcomes = new Map<string, WorkflowBuildOutcome>();
const verdicts = new Map<string, VerificationResult>();
return {
async reportBuildOutcome(outcome) {
outcomes.set(outcome.workItemId, outcome);
return { type: 'ignored', reason: 'eval-mode' } satisfies WorkflowLoopAction;
},
async reportVerificationVerdict(verdict) {
verdicts.set(verdict.workItemId, verdict);
return { type: 'ignored', reason: 'eval-mode' } satisfies WorkflowLoopAction;
},
async getBuildOutcome(workItemId) {
return outcomes.get(workItemId);
},
async getWorkflowLoopState(_workItemId): Promise<WorkflowLoopState | undefined> {
// Eval has no loop controller — verify-built-workflow tolerates undefined.
return undefined;
},
async updateBuildOutcome(workItemId, update) {
const existing = outcomes.get(workItemId);
if (!existing) return;
outcomes.set(workItemId, { ...existing, ...update });
},
peekOutcome(workItemId) {
return outcomes.get(workItemId);
},
peekVerdict(workItemId) {
return verdicts.get(workItemId);
},
};
}
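
For orientation, a minimal usage sketch of the round-trip the header comment describes (the outcome object is abbreviated with a cast and its values are illustrative; in the harness it is produced by `toWorkflowBuildOutcome` in `build-in-process.ts`):

const service = createInMemoryWorkflowTaskService();
// submit-workflow's onAttempt callback writes the outcome for the work item...
await service.reportBuildOutcome({
  workItemId: 'wi_example1',
  runId: 'eval-run-1',
  taskId: 'eval-task-1',
  submitted: true,
  triggerType: 'manual_or_testable',
  needsUserInput: false,
  summary: 'Workflow submitted and ready for verification.',
} as WorkflowBuildOutcome);
// ...verify-built-workflow reads it back by the same ID...
const outcome = await service.getBuildOutcome('wi_example1');
// ...and folds its verification result into the stored outcome.
await service.updateBuildOutcome('wi_example1', { summary: 'Verified via synthetic run.' });
// After the run, the harness can inspect what the agent ended up with.
const finalOutcome = service.peekOutcome('wi_example1');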

View File

@ -304,7 +304,7 @@ async function buildOutcomeWithLatestVerification(
return await finalBuildOutcome(context, workItemId, outcome);
}
const DETACHED_BUILDER_REQUIREMENTS = `## Detached Task Contract
export const DETACHED_BUILDER_REQUIREMENTS = `## Detached Task Contract
You are running as a detached background task. Do not stop after a successful submit; verify the workflow works.