mirror of https://github.com/n8n-io/n8n.git
synced 2026-05-12 16:10:30 +02:00

chore: Align pairwise eval builder with production handover (#30019)

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

This commit is contained in:
parent 7094b48c94
commit d0367a00e8
@@ -43,6 +43,13 @@ export interface BuilderRecord {
 	feedback: FeedbackEntry[];
 	tokenInput?: number;
 	tokenOutput?: number;
+	/** Number of `submit-workflow` calls during the build. IA-only — EE
+	 * doesn't capture a tool-call timeline in the comparable shape. */
+	submitCalls?: number;
+	/** Number of tool calls that errored or returned a failed result. */
+	toolCallErrors?: number;
+	/** Total tool calls observed, used as the error-rate denominator. */
+	toolCallsTotal?: number;
 }

 interface BuilderSummary {
@@ -59,6 +66,17 @@ interface BuilderSummary {
 		primaryPassRate: number;
 		avgDiagnostic: number;
 		avgDurationMs: number;
+		/** Total `submit-workflow` calls aggregated across IA records. Undefined
+		 * for EE (which doesn't capture a comparable tool-call timeline). */
+		submitCallsTotal?: number;
+		/** Mean `submit-workflow` calls per record (IA only). */
+		avgSubmitCalls?: number;
+		/** Total tool calls observed across IA records. */
+		toolCallsTotal?: number;
+		/** Total errored tool calls observed across IA records. */
+		toolCallErrors?: number;
+		/** `toolCallErrors / toolCallsTotal` micro-averaged. IA-only. */
+		toolCallErrorRate?: number;
 	};
 }
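// Illustrative sketch (not part of the diff): "micro-averaged" above means the
// rate is computed from counts pooled across all records, not by averaging each
// record's own rate — so a record with many tool calls weighs proportionally
// more. Names here are hypothetical.
function microAvgErrorRate(records: Array<{ toolCallErrors?: number; toolCallsTotal?: number }>): number {
	const errors = records.reduce((s, r) => s + (r.toolCallErrors ?? 0), 0);
	const total = records.reduce((s, r) => s + (r.toolCallsTotal ?? 0), 0);
	return total ? errors / total : 0;
}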
@@ -71,6 +89,16 @@ interface BuilderRun {
 // Instance AI loader (writes results.jsonl + workflows/<id>.json + summary.json)
 // ---------------------------------------------------------------------------

+interface IAToolCallTrace {
+	step: number;
+	toolCallId: string;
+	toolName: string;
+	args?: unknown;
+	result?: unknown;
+	error?: string;
+	elapsedMs?: number;
+}
+
 interface IAResultRecord {
 	exampleId: string;
 	iteration: number;
@@ -86,6 +114,25 @@ interface IAResultRecord {
 		tokenUsage?: { input?: number; output?: number };
 	};
 	feedback: Array<{ metric: string; score: number; kind?: string; comment?: string }>;
+	toolCalls?: IAToolCallTrace[];
 }

+/**
+ * Whether a tool call should count toward the "tool error rate" metric.
+ * Mirrors `isErroredToolCall` in `pairwise.ts`.
+ */
+function isErroredIAToolCall(trace: IAToolCallTrace): boolean {
+	if (trace.error !== undefined) return true;
+	const r = trace.result;
+	if (r === null || r === undefined) return false;
+	if (typeof r === 'object' && !Array.isArray(r)) {
+		const obj = r as Record<string, unknown>;
+		if (obj.success === false) return true;
+		if (typeof obj.error === 'string' && obj.error.length > 0) return true;
+		if (Array.isArray(obj.errors) && obj.errors.length > 0) return true;
+	}
+	if (typeof r === 'string' && /\bExit code:\s*[1-9]\d*\b/.test(r)) return true;
+	return false;
+}
+
 interface IASummary {
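// Illustrative sketch (not part of the diff): sample traces and how
// isErroredIAToolCall classifies them. Values are hypothetical.
const hardFailure: IAToolCallTrace = { step: 1, toolCallId: 'a', toolName: 'workflows', error: 'timeout' };
const rejectedSubmit: IAToolCallTrace = { step: 2, toolCallId: 'b', toolName: 'submit-workflow', result: { success: false, errors: ['missing trigger'] } };
const tscFailure: IAToolCallTrace = { step: 3, toolCallId: 'c', toolName: 'execute_command', result: 'tsc output… Exit code: 2' };
const cleanCall: IAToolCallTrace = { step: 4, toolCallId: 'd', toolName: 'nodes', result: { success: true } };
// isErroredIAToolCall → true, true, true, false respectively.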
@@ -125,7 +172,9 @@ async function loadInstanceAiRun(dir: string): Promise<BuilderRun> {
 		// Use only iteration 1 for a fair 1:1 comparison.
 		.filter((r) => r.iteration === 1);

-	const normalized: BuilderRecord[] = records.map((r) => ({
+	const normalized: BuilderRecord[] = records.map((r) => {
+		const tcs = r.toolCalls ?? [];
+		return {
 			prompt: r.prompt,
 			exampleId: r.exampleId,
 			dos: r.dos,
@@ -138,7 +187,11 @@ async function loadInstanceAiRun(dir: string): Promise<BuilderRun> {
 			feedback: r.feedback,
 			tokenInput: r.build.tokenUsage?.input,
 			tokenOutput: r.build.tokenUsage?.output,
-	}));
+			submitCalls: tcs.filter((tc) => tc.toolName === 'submit-workflow').length,
+			toolCallErrors: tcs.filter(isErroredIAToolCall).length,
+			toolCallsTotal: tcs.length,
+		};
+	});

 	const avgDuration =
 		normalized.length === 0
@@ -166,6 +219,10 @@ async function loadInstanceAiRun(dir: string): Promise<BuilderRun> {
 			? 0
 			: diagnosticScores.reduce((a, b) => a + b, 0) / diagnosticScores.length;

+	const submitCallsTotal = normalized.reduce((s, r) => s + (r.submitCalls ?? 0), 0);
+	const toolCallsTotal = normalized.reduce((s, r) => s + (r.toolCallsTotal ?? 0), 0);
+	const toolCallErrors = normalized.reduce((s, r) => s + (r.toolCallErrors ?? 0), 0);
+
 	return {
 		summary: {
 			label: `${summary.builder} (instance-ai)`,
@@ -181,6 +238,11 @@ async function loadInstanceAiRun(dir: string): Promise<BuilderRun> {
 				primaryPassRate,
 				avgDiagnostic,
 				avgDurationMs: avgDuration,
+				submitCallsTotal,
+				avgSubmitCalls: normalized.length ? submitCallsTotal / normalized.length : 0,
+				toolCallsTotal,
+				toolCallErrors,
+				toolCallErrorRate: toolCallsTotal ? toolCallErrors / toolCallsTotal : 0,
 			},
 		},
 		records: normalized,
@@ -564,6 +626,12 @@ function renderBuilderColumn(label: string, record: BuilderRecord | undefined):
 	if (record.tokenInput !== undefined && record.tokenOutput !== undefined) {
 		metaParts.push(`<span>${record.tokenInput}+${record.tokenOutput} tok</span>`);
 	}
+	if (record.submitCalls !== undefined && record.submitCalls > 0) {
+		metaParts.push(`<span>submit ×${record.submitCalls}</span>`);
+	}
+	if (record.toolCallErrors !== undefined && record.toolCallErrors > 0) {
+		metaParts.push(`<span>err ×${record.toolCallErrors}</span>`);
+	}

 	const errorBlock = record.errorMessage
 		? `<div class="error">${escapeHtml(record.errorMessage)}</div>`
@@ -676,6 +744,16 @@ function renderSummaryCard(
 	<div class="metric"><strong>${summary.totals.avgDiagnostic.toFixed(2)}</strong><span>avg diagnostic</span></div>
 	<div class="metric"><strong>${formatDuration(summary.totals.avgDurationMs)}</strong><span>avg build time</span></div>
 	<div class="metric"><strong>${summary.totals.buildSuccess}/${totalRecords}</strong><span>built ok</span></div>
+	${
+		summary.totals.toolCallErrorRate !== undefined
+			? `<div class="metric"><strong>${pct(summary.totals.toolCallErrorRate)}</strong><span>tool error rate (${summary.totals.toolCallErrors ?? 0}/${summary.totals.toolCallsTotal ?? 0})</span></div>`
+			: ''
+	}
+	${
+		summary.totals.avgSubmitCalls !== undefined
+			? `<div class="metric"><strong>${summary.totals.avgSubmitCalls.toFixed(2)}</strong><span>avg submit calls</span></div>`
+			: ''
+	}
 	${failureBits ? `<div class="meta-row failures">Failures: ${escapeHtml(failureBits)}</div>` : ''}
 </div>`;
 }
@@ -686,6 +764,8 @@ function renderMetricsNote(): string {
 	<span><b>Primary pass</b> — workflow passes only if a majority of LLM judges (2 of 3) find zero "don't" violations. Computed over all prompt attempts; build failures count as fail.</span>
 	<span><b>Average diagnostic</b> — mean fraction of criteria (dos + don'ts) satisfied across the dataset, averaged across judges. Range 0–1; gives partial credit.</span>
 	<span><b>Average build time</b> — averaged across all attempts including failures, so build timeouts (20-min cap) inflate this number.</span>
+	<span><b>Tool error rate</b> — fraction of tool calls that errored or returned a failed result (e.g. <code>tsc</code> non-zero exit, <code>submit-workflow</code> rejection). Captures build-path roughness even on builds that eventually succeeded. <i>IA-only.</i></span>
+	<span><b>Avg submit calls</b> — mean <code>submit-workflow</code> invocations per build. 1.0 = clean first-try submit. <i>IA-only.</i></span>
 	<span><b>Verdicts</b> compare per-prompt primary pass between the two builders.</span>
 </aside>`;
 }
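// Illustrative sketch (not part of the diff): the "primary pass" rule the
// note describes — a strict majority of judges must find zero "don't"
// violations. Names are hypothetical.
function primaryPass(judgeViolationCounts: number[]): boolean {
	const clean = judgeViolationCounts.filter((v) => v === 0).length;
	return clean * 2 > judgeViolationCounts.length; // e.g. 2 of 3 judges suffices
}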
@@ -21,6 +21,7 @@
 import { ChatAnthropic } from '@langchain/anthropic';
 import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
 import { Client as LangSmithClient } from 'langsmith';
+import { nanoid } from 'nanoid';
 import { promises as fs, readFileSync } from 'node:fs';
 import path from 'node:path';
 import pLimit from 'p-limit';
@@ -32,7 +33,9 @@ import {
 	type SimpleWorkflow,
 } from '../../../ai-workflow-builder.ee/evaluations/evaluators/pairwise';
 import { DEFAULTS } from '../../../ai-workflow-builder.ee/evaluations/support/constants';
+import { buildSubAgentBriefing } from '../../src/agent/sub-agent-briefing';
 import type { Logger } from '../../src/logger';
+import { DETACHED_BUILDER_REQUIREMENTS } from '../../src/tools/orchestration/build-workflow-agent.tool';
 import { BuilderSandboxFactory } from '../../src/workspace/builder-sandbox-factory';
 import type { SandboxConfig } from '../../src/workspace/create-workspace';
 import { SnapshotManager } from '../../src/workspace/snapshot-manager';
@@ -44,6 +47,13 @@ import {
 import { createLogger, type EvalLogger } from '../harness/logger';
 import { resolveSandboxConfig } from '../harness/sandbox-config';

+/** Default dataset — orchestrator-plan-derived spec rows. Each row's prompt
+ * is the spec the production planner hands the builder via
+ * `dispatchPlannedTask`. Pair this with the production briefing wrapper
+ * (`DETACHED_BUILDER_REQUIREMENTS`) below to keep the eval aligned with
+ * what the builder sees in production. */
+const DEFAULT_DATASET = 'instance-ai-builder-from-plans';
+
 // ---------------------------------------------------------------------------
 // CLI args
 // ---------------------------------------------------------------------------
@@ -86,7 +96,7 @@ function parseArgs(argv: string[]): PairwiseArgs {
 	}

 	return {
-		dataset: get('--dataset') ?? DEFAULTS.DATASET_NAME,
+		dataset: get('--dataset') ?? DEFAULT_DATASET,
 		judges: parsePositiveInt(get('--judges'), '--judges') ?? Number(DEFAULTS.NUM_JUDGES),
 		iterations:
 			parsePositiveInt(get('--iterations'), '--iterations') ?? Number(DEFAULTS.REPETITIONS),
@@ -230,24 +240,6 @@ interface ExampleRecord {
 	feedback: Feedback[];
 }

-/**
- * Eval-only suffix appended to every dataset prompt. Pushes the agent past
- * its production "ask before assuming / set up credentials first" instinct
- * — there is no human in the loop, so a clarification turn is a guaranteed
- * `no_workflow_built`. Lives in the harness, not the production builder
- * prompt, so production behavior is unaffected.
- *
- * Strictly describes the eval environment and the required terminal action
- * (call `submit-workflow`). Does not name SDK helpers or otherwise lead the
- * agent toward specific implementation choices — those are what the eval
- * measures.
- */
-const EVAL_PROMPT_SUFFIX =
-	'\n\n---\n' +
-	'You are running inside an automated, non-interactive evaluation. ' +
-	'There is no human to answer follow-up questions. ' +
-	'Do not call `ask-user` and do not ask for clarification — pick reasonable defaults and proceed.';
-
 async function runExample(
 	example: DatasetExample,
 	iteration: number,
@@ -262,8 +254,25 @@ async function runExample(
 		'chunks',
 		`${safeFilename(`${example.id}_${iteration}`)}.jsonl`,
 	);
+	// Wrap the prompt the same way the production orchestrator wraps the spec
+	// it hands to the builder sub-agent (see `build-workflow-agent.tool.ts`).
+	// Keeping this aligned with prod is what closes the eval/prod gap —
+	// `DETACHED_BUILDER_REQUIREMENTS` is what tells the builder it must
+	// `submit-workflow` then `verify-built-workflow` before stopping.
+	//
+	// `workItemId` round-trips: the briefing's `additionalContext` tells the
+	// agent its work-item ID, the agent passes it to `verify-built-workflow`,
+	// which reads back the build outcome from the in-memory
+	// `workflowTaskService` keyed on the same ID.
+	const workItemId = 'wi_' + nanoid(8);
+	const builderPrompt = await buildSubAgentBriefing({
+		task: example.prompt,
+		additionalContext: `[WORK ITEM ID: ${workItemId}]`,
+		requirements: DETACHED_BUILDER_REQUIREMENTS,
+	});
 	const build = await buildInProcess({
-		prompt: example.prompt + EVAL_PROMPT_SUFFIX,
+		prompt: builderPrompt,
+		workItemId,
 		timeoutMs: args.timeoutMs,
 		logPath,
 		sandboxFactory,
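// Illustrative sketch (not part of the diff): the workItemId round-trip the
// comment above describes, shown against the in-memory service this commit
// adds. Field values are hypothetical.
const svc = createInMemoryWorkflowTaskService();
const demoId = 'wi_demo';
// submit-workflow's onAttempt callback stores the outcome under the ID …
await svc.reportBuildOutcome({
	workItemId: demoId,
	runId: 'r1',
	taskId: 't1',
	submitted: true,
	triggerType: 'manual_or_testable',
	needsUserInput: false,
	summary: 'Workflow submitted and ready for verification.',
});
// … and verify-built-workflow later reads it back by the same ID.
const outcome = await svc.getBuildOutcome(demoId);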
@@ -336,6 +345,21 @@ interface Summary {
 		buildFailures: Record<string, number>;
 		primaryPassRate: number;
 		avgDiagnostic: number;
+		/** Total `submit-workflow` tool invocations across all records. */
+		submitCallsTotal: number;
+		/** Mean `submit-workflow` invocations per build. 1.0 = every build called
+		 * submit exactly once; >1.0 = builds had to fix and re-submit. */
+		avgSubmitCalls: number;
+		/** (errored tool calls) / (total tool calls) micro-averaged across all
+		 * runs. Captures how rough the build path was even on builds that
+		 * eventually succeeded — every TypeScript compile error or failed
+		 * domain tool call shows up here. */
+		toolCallErrorRate: number;
+		/** Total tool calls observed (used as the error-rate denominator and
+		 * surfaced for context). */
+		toolCallsTotal: number;
+		/** Total errored tool calls observed (numerator of `toolCallErrorRate`). */
+		toolCallErrors: number;
 	};
 	interactivity: {
 		askUserCount: number;
@@ -386,12 +410,17 @@ async function writeOutputs(
 		'durationMs',
 		'askUserCount',
 		'planToolCount',
+		'submitCalls',
+		'toolCalls',
+		'toolCallErrors',
 		'pairwisePrimary',
 		'pairwiseDiagnostic',
 		'pairwiseJudgesPassed',
 	].join(',');
 	const csvRows = records.map((r) => {
 		const find = (m: string) => r.feedback.find((f) => f.metric === m)?.score ?? '';
+		const submits = r.toolCalls.filter((tc) => tc.toolName === 'submit-workflow').length;
+		const errors = r.toolCalls.filter(isErroredToolCall).length;
 		return [
 			r.exampleId,
 			r.iteration,
@@ -400,6 +429,9 @@ async function writeOutputs(
 			r.build.durationMs,
 			r.build.interactivity.askUserCount,
 			r.build.interactivity.planToolCount,
+			submits,
+			r.toolCalls.length,
+			errors,
 			find('pairwise_primary'),
 			find('pairwise_diagnostic'),
 			find('pairwise_judges_passed'),
@@ -420,6 +452,9 @@ async function writeOutputs(
 	let askUserCount = 0;
 	let planToolCount = 0;
 	let autoApprovedSuspensions = 0;
+	let submitCallsTotal = 0;
+	let toolCallsTotal = 0;
+	let toolCallErrors = 0;

 	for (const record of records) {
 		if (record.build.success) buildSuccess++;
@@ -433,6 +468,18 @@ async function writeOutputs(
 			allMockedCreds.add(type);
 		}

+		// `toolCalls` is the ordered timeline captured by the trace collector.
+		// We count any tool call that errored OR returned a failed result —
+		// hard Mastra tool failures are rare, but `submit-workflow` rejections
+		// and `execute_command` returning a non-zero `tsc` exit are common and
+		// dominate the "rough path" signal we care about. Suspensions are
+		// benign (auto-approved or surfaced via `errorClass` separately).
+		for (const tc of record.toolCalls) {
+			toolCallsTotal++;
+			if (isErroredToolCall(tc)) toolCallErrors++;
+			if (tc.toolName === 'submit-workflow') submitCallsTotal++;
+		}
+
 		const primary = record.feedback.find((f) => f.metric === 'pairwise_primary')?.score;
 		if (typeof primary === 'number') {
 			primaryPassSum += primary;
@@ -469,6 +516,11 @@ async function writeOutputs(
 			buildFailures,
 			primaryPassRate: primaryPassCount ? primaryPassSum / primaryPassCount : 0,
 			avgDiagnostic: diagnosticCount ? diagnosticSum / diagnosticCount : 0,
+			submitCallsTotal,
+			avgSubmitCalls: records.length ? submitCallsTotal / records.length : 0,
+			toolCallsTotal,
+			toolCallErrors,
+			toolCallErrorRate: toolCallsTotal ? toolCallErrors / toolCallsTotal : 0,
 		},
 		interactivity: {
 			askUserCount,
@@ -645,6 +697,38 @@ function safeFilename(s: string): string {
 	return s.replace(/[^a-zA-Z0-9._-]+/g, '_').slice(0, 120);
 }

+/**
+ * Whether a tool call should count toward the "tool error rate" metric.
+ *
+ * Catches three flavours:
+ *   1. **Hard Mastra failure** (`trace.error` set) — tool threw / rejected.
+ *   2. **Tool returned a failed result object** — e.g. `submit-workflow`
+ *      returning `{ success: false, errors: [...] }`. Looks at top-level
+ *      `success === false` or non-empty `errors` array, plus a string
+ *      `error` field.
+ *   3. **`execute_command` returned a non-zero exit code** — e.g. `tsc`
+ *      spitting out compile errors. Looks for an `Exit code: <non-zero>`
+ *      marker in the result text.
+ */
+function isErroredToolCall(trace: ToolCallTrace): boolean {
+	if (trace.error !== undefined) return true;
+	const r = trace.result;
+	if (r === null || r === undefined) return false;
+
+	if (typeof r === 'object' && !Array.isArray(r)) {
+		const obj = r as Record<string, unknown>;
+		if (obj.success === false) return true;
+		if (typeof obj.error === 'string' && obj.error.length > 0) return true;
+		if (Array.isArray(obj.errors) && obj.errors.length > 0) return true;
+	}
+
+	if (typeof r === 'string' && /\bExit code:\s*[1-9]\d*\b/.test(r)) {
+		return true;
+	}
+
+	return false;
+}
+
 async function fileExists(filePath: string): Promise<boolean> {
 	try {
 		await fs.access(filePath);
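// Illustrative sketch (not part of the diff): the exit-code regex in
// isErroredToolCall matches only non-zero codes, so successful commands
// never count as errors.
const exitCodeRe = /\bExit code:\s*[1-9]\d*\b/;
exitCodeRe.test('tsc done. Exit code: 0'); // → false (success)
exitCodeRe.test('tsc failed. Exit code: 2'); // → true (compile errors)
exitCodeRe.test('Exit code: 10'); // → true (leading digit is non-zero)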
@@ -34,6 +34,11 @@ interface SummaryJson {
 		buildFailures: Record<string, number>;
 		primaryPassRate: number;
 		avgDiagnostic: number;
+		submitCallsTotal?: number;
+		avgSubmitCalls?: number;
+		toolCallsTotal?: number;
+		toolCallErrors?: number;
+		toolCallErrorRate?: number;
 	};
 	interactivity: {
 		askUserCount: number;
@@ -168,6 +173,36 @@ function escapeAttr(input: string): string {
 	return input.replace(/&/g, '&amp;').replace(/'/g, '&#39;').replace(/"/g, '&quot;');
 }

+/**
+ * Whether a tool call should count toward the "tool error rate" metric.
+ * Mirrors `isErroredToolCall` in `pairwise.ts` — kept in sync by hand
+ * because the report walks pre-saved `results.jsonl` files written by
+ * older runs of the eval too.
+ */
+function isErroredToolCall(trace: ToolCallTrace): boolean {
+	if (trace.error !== undefined) return true;
+	const r = trace.result;
+	if (r === null || r === undefined) return false;
+	if (typeof r === 'object' && !Array.isArray(r)) {
+		const obj = r as Record<string, unknown>;
+		if (obj.success === false) return true;
+		if (typeof obj.error === 'string' && obj.error.length > 0) return true;
+		if (Array.isArray(obj.errors) && obj.errors.length > 0) return true;
+	}
+	if (typeof r === 'string' && /\bExit code:\s*[1-9]\d*\b/.test(r)) return true;
+	return false;
+}
+
+function countSubmitCalls(traces: ToolCallTrace[] | undefined): number {
+	if (!traces) return 0;
+	return traces.filter((t) => t.toolName === 'submit-workflow').length;
+}
+
+function countToolCallErrors(traces: ToolCallTrace[] | undefined): number {
+	if (!traces) return 0;
+	return traces.filter(isErroredToolCall).length;
+}
+
 function findScore(feedback: FeedbackEntry[], metric: string): number | undefined {
 	return feedback.find((f) => f.metric === metric)?.score;
 }
@@ -333,6 +368,15 @@ function renderExample(record: ResultRecord, idPrefix: string): string {
 	if (interact.mockedCredentialTypes.length > 0)
 		interactBits.push(`mocked creds: ${interact.mockedCredentialTypes.join(', ')}`);

+	// Per-record build-path stats. Surfaced inline in the summary line so a
+	// reviewer can scan retries / errors without expanding each row. Numbers
+	// match the columns added to `results.csv`.
+	const submitCalls = countSubmitCalls(record.toolCalls);
+	const toolErrors = countToolCallErrors(record.toolCalls);
+	const buildStatBits: string[] = [];
+	if (submitCalls > 0) buildStatBits.push(`submit ×${submitCalls}`);
+	if (toolErrors > 0) buildStatBits.push(`err ×${toolErrors}`);
+
 	const errorBlock = record.build.errorMessage
 		? `<div class="error">${escapeHtml(record.build.errorMessage)}</div>`
 		: '';
@@ -349,6 +393,7 @@ function renderExample(record: ResultRecord, idPrefix: string): string {
 		</div>
 		<span class="iteration">#${record.iteration}</span>
 		<span class="duration">${record.build.durationMs}ms</span>
+		${buildStatBits.length > 0 ? `<span class="build-stats">${buildStatBits.map(escapeHtml).join(' · ')}</span>` : ''}
 		<span class="badges">${renderFeedbackBadges(record.feedback)}</span>
 	</summary>
 	<div class="body">
@@ -412,6 +457,16 @@ function renderRun(run: Run, index: number): string {
 	<span class="total ${totalFailures > 0 ? 'fail' : ''}"><strong>Build fail:</strong> ${totalFailures}${failureDetail ? ` (${escapeHtml(failureDetail)})` : ''}</span>
 	<span class="total"><strong>Primary pass rate:</strong> ${pct(s.totals.primaryPassRate)}</span>
 	<span class="total"><strong>Avg diagnostic:</strong> ${s.totals.avgDiagnostic.toFixed(2)}</span>
+	${
+		s.totals.toolCallErrorRate !== undefined
+			? `<span class="total ${s.totals.toolCallErrorRate > 0.1 ? 'fail' : ''}"><strong>Tool error rate:</strong> ${pct(s.totals.toolCallErrorRate)}${s.totals.toolCallErrors !== undefined && s.totals.toolCallsTotal !== undefined ? ` (${s.totals.toolCallErrors}/${s.totals.toolCallsTotal})` : ''}</span>`
+			: ''
+	}
+	${
+		s.totals.avgSubmitCalls !== undefined
+			? `<span class="total"><strong>Submit calls:</strong> ${s.totals.submitCallsTotal ?? 0} total, ${s.totals.avgSubmitCalls.toFixed(2)} avg/build</span>`
+			: ''
+	}
 	</div>
 	${
 		s.interactivity.askUserCount > 0 ||
@@ -504,6 +559,7 @@ export function renderDocument(runs: Run[]): string {
 	details.example > summary .example-id { font-family: ui-monospace, monospace; font-size: 11px; color: var(--muted); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
 	details.example > summary .iteration { color: var(--muted); font-size: 11px; }
 	details.example > summary .duration { color: var(--muted); font-size: 11px; text-align: right; }
+	details.example > summary .build-stats { color: var(--muted); font-size: 11px; text-align: right; white-space: nowrap; }
 	details.example > summary .badges { display: flex; gap: 6px; flex-wrap: wrap; justify-content: flex-end; }
 	.badge { font-size: 11px; padding: 2px 6px; border-radius: 3px; background: rgba(139,148,158,0.18); color: var(--fg); }
 	.badge.badge-pass { background: rgba(63,185,80,0.2); color: var(--pass); }
@@ -40,6 +40,10 @@ import path from 'node:path';
 import { normalizeWorkflow } from './normalize-workflow';
 import { stringifyError, truncate } from './redact';
 import { createStubServices, defaultNodesJsonPath, type StubServiceHandle } from './stub-services';
+import {
+	createInMemoryWorkflowTaskService,
+	type InMemoryWorkflowTaskService,
+} from './stub-workflow-task-service';
 import type { SimpleWorkflow } from '../../../ai-workflow-builder.ee/evaluations/evaluators/pairwise';
 import { registerWithMastra } from '../../src/agent/register-with-mastra';
 import { MAX_STEPS } from '../../src/constants/max-steps';
@@ -48,9 +52,15 @@ import type { Logger } from '../../src/logger';
 import { executeResumableStream } from '../../src/runtime/resumable-stream-executor';
 import { createAllTools } from '../../src/tools';
 import { createSandboxBuilderAgentPrompt } from '../../src/tools/orchestration/build-workflow-agent.prompt';
-import { createSubmitWorkflowTool } from '../../src/tools/workflows/submit-workflow.tool';
-import type { ModelConfig } from '../../src/types';
+import { createVerifyBuiltWorkflowTool } from '../../src/tools/orchestration/verify-built-workflow.tool';
+import {
+	createSubmitWorkflowTool,
+	type SubmitWorkflowAttempt,
+} from '../../src/tools/workflows/submit-workflow.tool';
+import type { ModelConfig, OrchestrationContext } from '../../src/types';
 import { asResumable } from '../../src/utils/stream-helpers';
+import { createRemediation } from '../../src/workflow-loop/remediation';
+import type { WorkflowBuildOutcome } from '../../src/workflow-loop/workflow-loop-state';
 import type {
 	BuilderSandboxFactory,
 	BuilderWorkspace,
@@ -130,6 +140,15 @@ export interface BuildInProcessOptions {
 	 * `WorkflowJSON`. The workspace is destroyed on completion.
 	 */
 	sandboxFactory: BuilderSandboxFactory;
+	/**
+	 * Optional pre-generated work item ID. Pass this when the caller has
+	 * already embedded `[WORK ITEM ID: ${workItemId}]` into the prompt's
+	 * briefing — `verify-built-workflow` reads the same value back from the
+	 * in-memory `workflowTaskService` keyed on this ID. When omitted, a
+	 * fresh ID is generated; in that case `verify-built-workflow` won't be
+	 * called by the agent (the briefing didn't tell it what value to pass).
+	 */
+	workItemId?: string;
 }

 // ---------------------------------------------------------------------------
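// Illustrative sketch (not part of the diff): the work-item ID travels as
// plain text inside the briefing's additionalContext. The regex below is an
// assumption for illustration, not the production parser.
import { nanoid } from 'nanoid';

const demoWorkItemId = 'wi_' + nanoid(8);
const additionalContext = `[WORK ITEM ID: ${demoWorkItemId}]`;
const recovered = /\[WORK ITEM ID: ([\w-]+)\]/.exec(additionalContext)?.[1]; // → demoWorkItemId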
@@ -230,6 +249,35 @@ export async function buildInProcess(
 	}
 	const prompt = createSandboxBuilderAgentPrompt(root);

+	// Per-build identifiers — match what production (`build-workflow-agent.tool.ts`)
+	// generates per orchestrator-dispatched task. The builder agent reads
+	// `workItemId` from the briefing's `additionalContext`, then passes it to
+	// `verify-built-workflow` to round-trip its build outcome.
+	const workItemId = options.workItemId ?? 'wi_' + nanoid(8);
+	const taskId = 'eval-task-' + nanoid(6);
+	const threadId = 'eval-thread-' + nanoid(6);
+	const runId = 'eval-run-' + nanoid(6);
+	const agentId = 'eval-builder-' + nanoid(6);
+	const logger = silentLogger();
+
+	// In-memory build-outcome / verification store. Lives for the duration
+	// of this single build; never shared. The workflowTaskService interface
+	// is what `verify-built-workflow` reads from after `submit-workflow`
+	// records the attempt below.
+	const workflowTaskService: InMemoryWorkflowTaskService = createInMemoryWorkflowTaskService();
+
+	// Minimal OrchestrationContext shim for `createVerifyBuiltWorkflowTool`.
+	// Verify-built-workflow only reads `workflowTaskService`, `domainContext`,
+	// `runId`, and `logger` at runtime — the rest of OrchestrationContext is
+	// orchestrator scaffolding the builder doesn't touch.
+	const verifyContext = {
+		threadId,
+		runId,
+		logger,
+		domainContext: services.context,
+		workflowTaskService,
+	} as unknown as OrchestrationContext;
+
 	const sandboxToolNames = [
 		'nodes',
 		'workflows',
@@ -241,9 +289,23 @@ export async function buildInProcess(
 		const tool = (allTools as Record<string, unknown>)[name];
 		if (tool) builderTools[name] = tool;
 	}
-	builderTools['submit-workflow'] = createSubmitWorkflowTool(services.context, builderWs.workspace);
-
-	const agentId = 'eval-builder-' + nanoid(6);
+	// `submit-workflow` reports each attempt back via the onAttempt callback.
+	// Production wires this to `workflowTaskService.reportBuildOutcome` so the
+	// builder loop and `verify-built-workflow` can read it. We mirror that
+	// here so the same prompt contract works in eval.
+	builderTools['submit-workflow'] = createSubmitWorkflowTool(
+		services.context,
+		builderWs.workspace,
+		undefined,
+		async (attempt) => {
+			await workflowTaskService.reportBuildOutcome(
+				toWorkflowBuildOutcome(workItemId, runId, taskId, attempt),
+			);
+		},
+	);
+	builderTools['verify-built-workflow'] = createVerifyBuiltWorkflowTool(verifyContext);

 	const agent = new Agent({
 		id: agentId,
 		name: 'Eval Workflow Builder',
@@ -266,14 +328,11 @@ export async function buildInProcess(

 	const abortController = new AbortController();
 	const timeoutHandle = setTimeout(() => abortController.abort(), timeoutMs);
-	const threadId = 'eval-thread-' + nanoid(6);
-	const runId = 'eval-run-' + nanoid(6);
 	const eventBus = wrapEventBusWithObserver(createInMemoryEventBus(), (event) => {
 		observeEvent(event, interactivity);
 		traceCollector.observe(event);
 		chunkLog?.writeEvent(event);
 	});
-	const logger = silentLogger();

 	let finalText: string | undefined;
 	try {
@@ -310,6 +369,24 @@ export async function buildInProcess(
 				interactivity.askUserCount++;
 			}
 		},
+		// Match production (`consumeStreamWithHitl`): when a suspension
+		// auto-resumes, pass `maxSteps` and the same providerOptions to
+		// `resumeStream`. Without these, Mastra's `resumeStream` defaults
+		// to its built-in `stepCountIs(5)` cap — which silently truncates
+		// the agent's post-suspension work after every HITL tool. In a
+		// builder run that creates data tables before writing the file,
+		// the resume budget gets eaten by the time the agent reaches
+		// `submit-workflow`, and the run dies mid-flow with a stale
+		// `finishReason: 'suspended'`. See `consume-with-hitl.ts` for
+		// the production wiring.
+		buildResumeOptions: ({ mastraRunId, suspension }) => ({
+			runId: mastraRunId,
+			toolCallId: suspension.toolCallId,
+			maxSteps,
+			providerOptions: {
+				anthropic: { cacheControl: { type: 'ephemeral' as const } },
+			},
+		}),
 	},
 });
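// Illustrative sketch (not part of the diff): the failure mode the comment
// above describes. Numbers are hypothetical.
const defaultResumeBudget = 5; // Mastra's built-in stepCountIs(5) cap
const stepsStillNeeded = 8; // e.g. write file, compile, submit, verify …
const truncatedMidFlow = stepsStillNeeded > defaultResumeBudget; // → true
// Passing maxSteps through buildResumeOptions restores the harness-wide
// budget, so post-suspension work isn't silently cut short.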
@@ -799,3 +876,62 @@ async function safeSettle<T>(value: Promise<T> | undefined): Promise<T | undefined> {
 		return undefined;
 	}
 }
+
+/**
+ * Convert a `submit-workflow` attempt into a `WorkflowBuildOutcome`.
+ *
+ * Production's `build-workflow-agent.tool.ts` does the same thing inside the
+ * orchestrator. We mirror it here (minus orchestrator-only fields like
+ * triggerType detection) so `verify-built-workflow` finds a sensible outcome
+ * stored against the workItemId.
+ */
+function toWorkflowBuildOutcome(
+	workItemId: string,
+	runId: string,
+	taskId: string,
+	attempt: SubmitWorkflowAttempt,
+): WorkflowBuildOutcome {
+	if (!attempt.success) {
+		return {
+			workItemId,
+			runId,
+			taskId,
+			submitted: false,
+			triggerType: 'manual_or_testable',
+			needsUserInput: false,
+			failureSignature: attempt.errors?.join('; '),
+			remediation: attempt.remediation,
+			summary: attempt.errors?.join(' ') ?? 'Workflow submission failed.',
+		};
+	}
+	const placeholderRemediation = attempt.hasUnresolvedPlaceholders
+		? createRemediation({
+				category: 'needs_setup',
+				shouldEdit: false,
+				reason: 'mocked_credentials_or_placeholders',
+				guidance:
+					'Workflow submitted successfully, but unresolved setup values remain. Stop code edits.',
+			})
+		: undefined;
+	return {
+		workItemId,
+		runId,
+		taskId,
+		workflowId: attempt.workflowId,
+		submitted: true,
+		// Eval doesn't run trigger-aware verification, so the value here is
+		// cosmetic — the verify tool branches on `executionService.run` result,
+		// not this field.
+		triggerType: 'manual_or_testable',
+		needsUserInput: Boolean(placeholderRemediation),
+		blockingReason: placeholderRemediation?.guidance,
+		mockedNodeNames: attempt.mockedNodeNames,
+		mockedCredentialTypes: attempt.mockedCredentialTypes,
+		mockedCredentialsByNode: attempt.mockedCredentialsByNode,
+		triggerNodes: attempt.triggerNodes,
+		verificationPinData: attempt.verificationPinData,
+		hasUnresolvedPlaceholders: attempt.hasUnresolvedPlaceholders,
+		remediation: placeholderRemediation ?? attempt.remediation,
+		summary: 'Workflow submitted and ready for verification.',
+	};
+}
@@ -189,8 +189,22 @@ export async function createStubServices(
 		async list() {
 			return [];
 		},
-		async run() {
-			return stubExecutionResult('stub: execution disabled in eval');
+		// `verify-built-workflow` invokes `executionService.run()` after
+		// `submit-workflow` has captured the TS-compiled workflow JSON. The eval
+		// has no execution backend, but we want the builder agent's submit →
+		// verify → done sequence to complete cleanly so the production briefing
+		// (`DETACHED_BUILDER_REQUIREMENTS`) reads coherently. Returning a
+		// synthetic success here lets the agent terminate after submit. The
+		// eval's `buildSuccess` metric is derived from `submit-workflow` capture
+		// — never from this synthetic verdict — so this can't inflate the score.
+		async run(workflowId) {
+			return {
+				executionId: 'eval-exec-' + nanoid(),
+				status: 'success' as const,
+				data: { __eval_synthetic_verify__: [{ workflowId }] },
+				startedAt: new Date().toISOString(),
+				finishedAt: new Date().toISOString(),
+			};
 		},
 		async getStatus() {
 			return stubExecutionResult('stub: execution disabled in eval');
@@ -0,0 +1,85 @@
+// ---------------------------------------------------------------------------
+// In-memory `WorkflowTaskService` stub for the in-process eval harness.
+//
+// Production wires `workflowTaskService` through `instance-ai.service.ts` so
+// the orchestrator can persist build outcomes per `workItemId` and the
+// builder sub-agent can read them back via `verify-built-workflow`. The eval
+// has no persistence layer, so we mirror the production interface against an
+// in-memory map. This is enough for the production builder briefing
+// (`DETACHED_BUILDER_REQUIREMENTS`) to read coherently:
+//
+//   submit-workflow        → reportBuildOutcome  (writes to map)
+//   verify-built-workflow  → getBuildOutcome     (reads from map) + executes
+//   verify result          → updateBuildOutcome  (writes verification record)
+//
+// Each `buildInProcess` call gets its own service instance — no cross-build
+// state leaks.
+// ---------------------------------------------------------------------------
+
+/* eslint-disable @typescript-eslint/require-await */
+// All `WorkflowTaskService` methods are interface-async even when the
+// implementation is synchronous in-memory bookkeeping.
+
+import type { WorkflowTaskService } from '../../src/types';
+import type {
+	VerificationResult,
+	WorkflowBuildOutcome,
+	WorkflowLoopAction,
+	WorkflowLoopState,
+} from '../../src/workflow-loop/workflow-loop-state';
+
+export interface InMemoryWorkflowTaskService extends WorkflowTaskService {
+	/** Read-only access to the latest stored outcome — used by callers that
+	 * want to inspect what the agent ended up with after the run. */
+	peekOutcome(workItemId: string): WorkflowBuildOutcome | undefined;
+	/** Read-only access to the latest stored verification verdict. */
+	peekVerdict(workItemId: string): VerificationResult | undefined;
+}
+
+/**
+ * Build a fresh in-memory WorkflowTaskService.
+ *
+ * `reportBuildOutcome` and `reportVerificationVerdict` always return
+ * `{ type: 'ignored', reason: 'eval-mode' }` because the eval has no
+ * workflow-loop controller — there's no rebuild/verify state machine to
+ * advance. The builder agent only needs the read-back paths to work.
+ */
+export function createInMemoryWorkflowTaskService(): InMemoryWorkflowTaskService {
+	const outcomes = new Map<string, WorkflowBuildOutcome>();
+	const verdicts = new Map<string, VerificationResult>();
+
+	return {
+		async reportBuildOutcome(outcome) {
+			outcomes.set(outcome.workItemId, outcome);
+			return { type: 'ignored', reason: 'eval-mode' } satisfies WorkflowLoopAction;
+		},
+
+		async reportVerificationVerdict(verdict) {
+			verdicts.set(verdict.workItemId, verdict);
+			return { type: 'ignored', reason: 'eval-mode' } satisfies WorkflowLoopAction;
+		},
+
+		async getBuildOutcome(workItemId) {
+			return outcomes.get(workItemId);
+		},
+
+		async getWorkflowLoopState(_workItemId): Promise<WorkflowLoopState | undefined> {
+			// Eval has no loop controller — verify-built-workflow tolerates undefined.
+			return undefined;
+		},
+
+		async updateBuildOutcome(workItemId, update) {
+			const existing = outcomes.get(workItemId);
+			if (!existing) return;
+			outcomes.set(workItemId, { ...existing, ...update });
+		},
+
+		peekOutcome(workItemId) {
+			return outcomes.get(workItemId);
+		},
+
+		peekVerdict(workItemId) {
+			return verdicts.get(workItemId);
+		},
+	};
+}
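// Illustrative sketch (not part of the diff): post-run inspection through the
// peek helpers, which read the maps synchronously. The ID is hypothetical.
const svc = createInMemoryWorkflowTaskService();
const outcome = svc.peekOutcome('wi_demo'); // undefined if the agent never submitted
const verdict = svc.peekVerdict('wi_demo'); // undefined if verify never reported
if (outcome && !verdict) {
	// submitted but never verified — the detached-task contract was only half-met
}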
@@ -304,7 +304,7 @@ async function buildOutcomeWithLatestVerification(
 	return await finalBuildOutcome(context, workItemId, outcome);
 }

-const DETACHED_BUILDER_REQUIREMENTS = `## Detached Task Contract
+export const DETACHED_BUILDER_REQUIREMENTS = `## Detached Task Contract

 You are running as a detached background task. Do not stop after a successful submit — verify the workflow works.