n8n/packages/@n8n/instance-ai/evaluations/cli/index.ts
José Braulio González Valido 700b32237f
feat(ai-builder): Surface WHAT-dimension binary checks per built workflow (no-changelog) (#30932)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 12:18:52 +01:00

1250 lines
43 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
// ---------------------------------------------------------------------------
// Instance AI workflow eval CLI
//
// Runs workflow execution evaluations. When LANGSMITH_API_KEY is set, uses
// LangSmith's evaluate() for experiment tracking and tracing. Otherwise
// falls back to a direct loop with the same eval-results.json output.
// ---------------------------------------------------------------------------
import type { InstanceAiEvalExecutionResult } from '@n8n/api-types';
import { mkdirSync, writeFileSync } from 'fs';
import { Client } from 'langsmith';
import { evaluate } from 'langsmith/evaluation';
import type { EvaluationResult } from 'langsmith/evaluation';
import type { Example, Run } from 'langsmith/schemas';
import { traceable } from 'langsmith/traceable';
import { join } from 'path';
import { z } from 'zod';
import { aggregateResults, passAtK, passHatK } from './aggregator';
import { parseCliArgs } from './args';
import { buildCIMetadata, computeExperimentPrefix } from './ci-metadata';
import { LaneAllocator } from './lane-allocator';
import { expandWithIterations, partitionRoundRobin } from './lanes';
import { aggregateWorkflowChecks, statusMap } from '../binaryChecks/aggregate';
import { CHECK_DIMENSIONS, type CheckOutcome } from '../binaryChecks/types';
import { N8nClient } from '../clients/n8n-client';
import {
compareBuckets,
type ComparisonOutcome,
type ComparisonResult,
type ExperimentBucket,
type ScenarioCounts,
} from '../comparison/compare';
import { fetchBaselineBucket, findLatestBaseline } from '../comparison/fetch-baseline';
import { formatComparisonMarkdown, formatComparisonTerminal } from '../comparison/format';
import { seedCredentials, cleanupCredentials } from '../credentials/seeder';
import { loadWorkflowTestCasesWithFiles } from '../data/workflows';
import type { WorkflowTestCaseWithFile } from '../data/workflows';
import { createLogger } from '../harness/logger';
import type { EvalLogger } from '../harness/logger';
import {
fetchPrebuiltBuild,
loadPrebuiltManifest,
pickPrebuiltWorkflowId,
type PrebuiltManifest,
} from '../harness/prebuilt-workflows';
import {
buildWorkflow,
executeScenario,
cleanupBuild,
runWorkflowChecks,
runWorkflowTestCase,
runWithConcurrency,
type BuildResult,
} from '../harness/runner';
import { syncDataset, type DatasetExampleInputs } from '../langsmith/dataset-sync';
import { snapshotWorkflowIds } from '../outcome/workflow-discovery';
import { writeWorkflowReport } from '../report/workflow-report';
import type {
MultiRunEvaluation,
ExecutionScenarioResult,
ExecutionScenario,
TranscriptTurn,
WorkflowTestCase,
WorkflowTestCaseResult,
} from '../types';
// n8n degrades above ~4 concurrent builds.
// This value is handed to the LaneAllocator in LangSmith mode and used as the
// runWithConcurrency limit in direct mode.
const MAX_CONCURRENT_BUILDS = 4;
/**
 * Runtime schema for one workflow-check outcome as it appears inside a run's
 * outputs (`workflowChecks`). `status` is one of pass / fail / n_a; the
 * feedback extractor skips `n_a` entries and scores pass as 1, fail as 0.
 */
const checkOutcomeSchema = z.object({
name: z.string(),
description: z.string(),
kind: z.enum(['deterministic', 'llm']),
dimension: z.enum(CHECK_DIMENSIONS),
status: z.enum(['pass', 'fail', 'n_a']),
comment: z.string().optional(),
});
/**
 * Runtime schema for the per-scenario object that `target()` returns and that
 * is later read back from `run.outputs`.
 *
 * Every field is either optional or has a default, so an empty object parses
 * successfully into a "failed" shape. `parseTargetOutput` rejects empty input
 * before calling this schema for that reason.
 */
const targetOutputSchema = z.object({
buildSuccess: z.boolean().default(false),
passed: z.boolean().default(false),
score: z.number().default(0),
reasoning: z.string().default(''),
workflowId: z.string().optional(),
failureCategory: z.string().optional(),
rootCause: z.string().optional(),
execErrors: z.array(z.string()).default([]),
// Kept as `unknown` here; `parseTargetOutput` narrows it with `isEvalResult`.
evalResult: z.unknown().optional(),
/** Only set on the scenario that initiated the build. */
buildDurationMs: z.number().optional(),
execDurationMs: z.number().default(0),
nodeCount: z.number().default(0),
/** The thread id used during the build — keys the LangSmith trace lookup. */
threadId: z.string().optional(),
workflowChecks: z.array(checkOutcomeSchema).optional(),
});
/**
 * Parsed target output. Identical to the schema's inferred type except that
 * `evalResult` is typed as `InstanceAiEvalExecutionResult` instead of `unknown`.
 */
type TargetOutput = Omit<z.infer<typeof targetOutputSchema>, 'evalResult'> & {
evalResult?: InstanceAiEvalExecutionResult;
};
/** Type guard: true for any non-null value of type `object` that is not an array. */
function isPlainObject(v: unknown): v is Record<string, unknown> {
	if (v === null || Array.isArray(v)) return false;
	return typeof v === 'object';
}
/**
 * Structural type guard for an eval execution result: `nodeResults` and
 * `hints` must be non-null objects and `errors` must be an array.
 */
function isEvalResult(v: unknown): v is InstanceAiEvalExecutionResult {
	if (!isPlainObject(v)) return false;
	const isNonNullObject = (candidate: unknown): boolean =>
		candidate !== null && typeof candidate === 'object';
	if (!isNonNullObject(v.nodeResults)) return false;
	if (!Array.isArray(v.errors)) return false;
	return isNonNullObject(v.hints);
}
/**
 * Safe-parse a run's outputs into a `TargetOutput`.
 *
 * Returns `undefined` when `raw` is not an object, is an empty object, or
 * fails schema validation. The emptiness check matters because every schema
 * field has a default: without it, a missing/empty output would parse into a
 * "failed" result (passed:false, score:0) and infra errors would be reported
 * as builder regressions. `evalResult` is kept only when it passes
 * `isEvalResult`; otherwise it is set to `undefined`.
 */
function parseTargetOutput(raw: unknown): TargetOutput | undefined {
	if (!isPlainObject(raw)) return undefined;
	if (Object.keys(raw).length === 0) return undefined;

	const result = targetOutputSchema.safeParse(raw);
	if (!result.success) return undefined;

	const { data } = result;
	const evalResult = isEvalResult(data.evalResult) ? data.evalResult : undefined;
	return { ...data, evalResult };
}
/**
 * Schema for the part of `run.inputs` that `reshapeLangSmithRuns` uses to
 * index runs. Missing fields fall back to defaults; `.passthrough()` keeps
 * any other input keys instead of stripping them.
 */
const runInputsSchema = z
.object({
testCaseFile: z.string().default(''),
scenarioName: z.string().default(''),
/** 0-based iteration index; injected during multi-run expansion. */
_iteration: z.number().int().nonnegative().default(0),
})
.passthrough();
/**
 * Target input shape with the iteration index we inject for multi-run.
 * `_iteration` is optional; `target()` treats a missing value as iteration 0.
 */
type TargetInputs = DatasetExampleInputs & { _iteration?: number };
/** Per-base-URL state created once in `main()` and shared by both run modes. */
interface Lane {
// Client for this lane's base URL; `main()` logs it in before building the lane.
client: N8nClient;
// Result of `snapshotWorkflowIds(client)` taken during lane setup.
preRunWorkflowIds: Set<string>;
// Starts empty and is passed to the build/run helpers together with
// `preRunWorkflowIds` — presumably to track workflows claimed by builds; confirm in harness/runner.
claimedWorkflowIds: Set<string>;
// Result of `seedCredentials`; `credentialIds` are cleaned up when `main()` finishes.
seedResult: { seededTypes: string[]; credentialIds: string[] };
}
/** Inputs shared by `runWithLangSmith` and `runDirectLoop`. */
interface RunConfig {
// Parsed CLI arguments.
args: ReturnType<typeof parseCliArgs>;
// One entry per `--base-url`, in the same order.
lanes: Lane[];
logger: EvalLogger;
// Present only when `--prebuilt-workflows` was supplied.
prebuiltManifest?: PrebuiltManifest;
}
/**
 * CLI entry point.
 *
 * 1. Parses CLI args and (optionally) loads and sanity-checks the prebuilt manifest.
 * 2. Sets up one lane per base URL: login, seed credentials, snapshot workflow ids.
 * 3. Runs the evaluation through LangSmith when `LANGSMITH_API_KEY` is set,
 *    otherwise through the direct loop.
 * 4. Writes the JSON results, PR comment and HTML report and prints the comparison.
 * 5. Always cleans up the credentials seeded on every lane.
 *
 * @throws Error when `--prebuilt-workflows` is combined with multiple base URLs,
 *   when any lane fails to set up, or when the evaluation itself throws.
 */
async function main(): Promise<void> {
	const args = parseCliArgs(process.argv.slice(2));
	const logger = createLogger(args.verbose);
	const prebuiltManifest = args.prebuiltWorkflows
		? loadPrebuiltManifest(args.prebuiltWorkflows)
		: undefined;
	if (prebuiltManifest) {
		// Multi-lane is for distributing the orchestrator build phase across
		// n8n instances. Prebuilt workflows live on a single instance — fetching
		// them from any other lane's URL would 404 — and prebuilt mode skips
		// builds anyway, so multi-lane buys nothing. Refuse the combination
		// rather than silently fetching from one lane and ignoring the rest.
		if (args.baseUrls.length > 1) {
			throw new Error(
				'--prebuilt-workflows is incompatible with multiple --base-url values. Prebuilt workflows live on a single n8n instance; pass exactly one --base-url.',
			);
		}
		const slugCount = Object.keys(prebuiltManifest).length;
		logger.info(`Loaded prebuilt manifest: ${String(slugCount)} test case(s)`);
		// Warn on slugs that don't match a local test-case file. Common cause
		// is typos in the manifest — without this check, the typo silently
		// falls through to an orchestrator build (or no run at all), and the
		// user thinks the prebuilt path ran when it didn't.
		const localSlugs = new Set(loadWorkflowTestCasesWithFiles().map((tc) => tc.fileSlug));
		const orphanSlugs = Object.keys(prebuiltManifest).filter((slug) => !localSlugs.has(slug));
		if (orphanSlugs.length > 0) {
			logger.warn(
				`Prebuilt manifest references ${String(orphanSlugs.length)} slug(s) with no matching local test case (will be ignored): ${orphanSlugs.join(', ')}`,
			);
		}
	}

	// One lane per base URL. The LangSmith path then uses a work-stealing
	// allocator (lane-allocator.ts) to dispatch builds across lanes; the direct
	// path partitions test cases statically per lane.
	//
	// allSettled (not all): if one lane fails we still need to know which other
	// lanes finished seeding so their credentials can be removed before exiting.
	const laneSetups = await Promise.allSettled(
		args.baseUrls.map(async (baseUrl, idx): Promise<Lane> => {
			const tag =
				args.baseUrls.length > 1
					? ` [lane ${String(idx + 1)}/${String(args.baseUrls.length)}]`
					: '';
			const client = new N8nClient(baseUrl);
			logger.info(`Authenticating with ${baseUrl}...${tag}`);
			await client.login(args.email, args.password);
			logger.success(`Authenticated${tag}`);
			logger.info(`Seeding credentials...${tag}`);
			const seedResult = await seedCredentials(client, undefined, logger);
			logger.info(`Seeded ${String(seedResult.credentialIds.length)} credential(s)${tag}`);
			try {
				const preRunWorkflowIds = await snapshotWorkflowIds(client);
				const claimedWorkflowIds = new Set<string>();
				return { client, preRunWorkflowIds, claimedWorkflowIds, seedResult };
			} catch (error: unknown) {
				// This lane never makes it into `lanes`, so undo its seeding here.
				await cleanupCredentials(client, seedResult.credentialIds).catch(() => {});
				throw error;
			}
		}),
	);

	const lanes: Lane[] = [];
	let setupFailure: PromiseRejectedResult | undefined;
	for (const setup of laneSetups) {
		if (setup.status === 'fulfilled') {
			lanes.push(setup.value);
		} else if (!setupFailure) {
			setupFailure = setup;
		}
	}

	// Best-effort removal of seeded credentials on every lane that was set up.
	const cleanupLanes = async (): Promise<void> => {
		await Promise.all(
			lanes.map(async (lane) => {
				await cleanupCredentials(lane.client, lane.seedResult.credentialIds).catch(() => {});
			}),
		);
	};

	if (setupFailure) {
		await cleanupLanes();
		const reason: unknown = setupFailure.reason;
		throw reason instanceof Error ? reason : new Error(String(reason));
	}

	const startTime = Date.now();
	try {
		const hasLangSmith = Boolean(process.env.LANGSMITH_API_KEY);
		let evaluation: MultiRunEvaluation;
		let experimentName: string | undefined;
		let outcome: ComparisonOutcome | undefined;
		let slugByTestCase: Map<WorkflowTestCase, string> | undefined;
		if (hasLangSmith) {
			logger.info('LangSmith API key detected, using evaluate() with experiment tracking');
			const langsmithRun = await runWithLangSmith({ args, lanes, logger, prebuiltManifest });
			evaluation = langsmithRun.evaluation;
			experimentName = langsmithRun.experimentName;
			outcome = langsmithRun.outcome;
			slugByTestCase = langsmithRun.slugByTestCase;
		} else {
			logger.info('No LANGSMITH_API_KEY, running direct loop (results in eval-results.json only)');
			evaluation = await runDirectLoop({ args, lanes, logger, prebuiltManifest });
		}
		const totalDuration = Date.now() - startTime;
		const commitSha = process.env.LANGSMITH_REVISION_ID ?? process.env.GITHUB_SHA;
		const { jsonPath, prCommentPath } = writeEvalResults(
			evaluation,
			totalDuration,
			args.outputDir,
			experimentName,
			outcome,
			commitSha,
			slugByTestCase,
		);
		console.log(`Results: ${jsonPath}`);
		console.log(`PR comment: ${prCommentPath}`);
		const reportResults = flattenRunsForReport(evaluation);
		const htmlPath = writeWorkflowReport(reportResults);
		console.log(`Report: ${htmlPath}`);
		console.log(
			'\n' + formatComparisonTerminal(evaluation, outcome, { commitSha, slugByTestCase }),
		);
	} finally {
		await cleanupLanes();
	}
}
// ---------------------------------------------------------------------------
// LangSmith mode: evaluate() with dataset sync, tracing, experiments
// ---------------------------------------------------------------------------
/**
 * LangSmith mode: syncs the dataset, runs every (iteration, scenario) example
 * through `evaluate()`, then reshapes the runs into a `MultiRunEvaluation`.
 *
 * Builds are shared: all scenarios of one test-case file in one iteration
 * reuse a single build (see `getOrBuild`). After the experiment finishes this
 * also writes aggregate metadata and per-run pass metrics back to LangSmith
 * and attempts a comparison via `tryRunComparison`. Unless `--keep-workflows`
 * is set, every build in the cache is cleaned up on the way out, even when
 * the evaluation throws.
 *
 * @param config Parsed args, lanes, logger and optional prebuilt manifest.
 * @returns The aggregated evaluation, the experiment name, the comparison
 *   outcome, and a testCase → fileSlug map for the formatters.
 */
async function runWithLangSmith(config: RunConfig): Promise<{
	evaluation: MultiRunEvaluation;
	experimentName: string;
	outcome: ComparisonOutcome;
	slugByTestCase: Map<WorkflowTestCase, string>;
}> {
	const { args, lanes, logger, prebuiltManifest } = config;
	const lsClient = new Client();
	const datasetName = await syncDataset(lsClient, args.dataset, logger, args.filter, args.exclude);
	const testCasesWithFiles = loadWorkflowTestCasesWithFiles(args.filter, args.exclude);
	// Stash transcripts by threadId so reshapeLangSmithRuns can merge them in —
	// the LangSmith target() output schema doesn't carry the full transcript.
	const transcriptByThreadId = new Map<string, TranscriptTurn[]>();
	// LangSmith dataset rows carry only per-scenario fields. The conversation
	// for the build is sourced locally, keyed by fileSlug.
	const conversationByFileSlug = new Map<
		string,
		{ conversation: WorkflowTestCase['conversation']; messageBudget?: number }
	>();
	for (const { testCase, fileSlug } of testCasesWithFiles) {
		conversationByFileSlug.set(fileSlug, {
			conversation: testCase.conversation,
			messageBudget: testCase.messageBudget,
		});
	}

	// LaneState carries the allocator-managed counters (activeBuilds,
	// inflightKeys) plus the lane's traced LangSmith wrappers. `runner` is
	// the underlying Lane (n8n client, credential state) — named distinctly so
	// it doesn't shadow the iteration variable `lane` in lanes.map().
	interface LaneState {
		runner: Lane;
		activeBuilds: number;
		inflightKeys: Set<string>;
		tracedBuild: (buildArgs: {
			conversation: WorkflowTestCase['conversation'];
			messageBudget?: number;
		}) => Promise<BuildResult>;
		tracedExecute: (execArgs: {
			workflowId: string;
			scenario: ExecutionScenario;
			workflowJsons: BuildResult['workflowJsons'];
		}) => Promise<Awaited<ReturnType<typeof executeScenario>>>;
	}

	const laneStates: LaneState[] = lanes.map((lane, idx) => {
		const laneNum = idx + 1;
		const laneTag = lanes.length > 1 ? ` [lane ${String(laneNum)}/${String(lanes.length)}]` : '';
		return {
			runner: lane,
			activeBuilds: 0,
			inflightKeys: new Set<string>(),
			tracedBuild: traceable(
				async (buildArgs: {
					conversation: WorkflowTestCase['conversation'];
					messageBudget?: number;
				}) =>
					await buildWorkflow({
						client: lane.client,
						conversation: buildArgs.conversation,
						messageBudget: buildArgs.messageBudget,
						timeoutMs: args.timeoutMs,
						preRunWorkflowIds: lane.preRunWorkflowIds,
						claimedWorkflowIds: lane.claimedWorkflowIds,
						logger,
						laneTag,
					}),
				{
					name: 'workflow_build',
					run_type: 'chain',
					client: lsClient,
					metadata: { lane: laneNum },
				},
			),
			tracedExecute: traceable(
				async (execArgs: {
					workflowId: string;
					scenario: ExecutionScenario;
					workflowJsons: BuildResult['workflowJsons'];
				}) =>
					await executeScenario(
						lane.client,
						execArgs.workflowId,
						execArgs.scenario,
						execArgs.workflowJsons,
						logger,
						args.timeoutMs,
					),
				{
					name: 'scenario_execution',
					run_type: 'chain',
					client: lsClient,
					metadata: { lane: laneNum },
				},
			),
		};
	});

	// Work-stealing: each build acquires a lane that isn't already running its
	// fileSlug, runs there (capped per-lane), then releases. Scenarios re-use the
	// lane that built their workflow.
	const allocator = new LaneAllocator(laneStates, MAX_CONCURRENT_BUILDS);
	const buildCache = new Map<
		string,
		Promise<{ build: BuildResult; lane: LaneState; buildDurationMs: number }>
	>();
	const buildDurations = new Map<string, number>();

	/**
	 * Return the (possibly in-flight) build for this iteration + test-case file,
	 * starting it on first request. The promise itself is cached, so concurrent
	 * scenarios of the same file await one build instead of starting several.
	 */
	async function getOrBuild(
		iteration: number,
		fileSlug: string,
	): Promise<{ build: BuildResult; lane: LaneState; buildDurationMs: number }> {
		// Cache key on (iteration, fileSlug) — every scenario in a test-case file
		// shares this build, and prebuilt + orchestrator-built paths use the same key.
		const key = `${String(iteration)}:${fileSlug}`;
		const existing = buildCache.get(key);
		if (existing) return await existing;
		const promise = (async () => {
			const prebuiltId = pickPrebuiltWorkflowId(prebuiltManifest, fileSlug, iteration);
			if (prebuiltId !== undefined) {
				// Prebuilt path: no orchestrator concurrency to manage — just
				// fetch the workflow. main() rejects multi-lane + prebuilt at
				// startup, so laneStates always has exactly one entry here.
				const lane = laneStates[0];
				const start = Date.now();
				const build = await fetchPrebuiltBuild(lane.runner.client, prebuiltId, logger);
				const buildDurationMs = Date.now() - start;
				buildDurations.set(key, buildDurationMs);
				stashTranscript(build);
				if (build.success && !build.workflowChecks) {
					// No transcript in prebuilt mode — checks run with empty prompt context.
					build.workflowChecks = await runWorkflowChecks({
						workflow: build.workflowJsons[0],
						prompt: '',
						agentText: undefined,
						logger,
					});
				}
				return { build, lane, buildDurationMs };
			}
			// Orchestrator path: allocator spreads distinct fileSlugs across lanes;
			// the build cache dedupes scenarios within one file.
			// Resolve the conversation BEFORE acquiring a lane: throwing after
			// acquire() but outside the try/finally would leave the lane slot
			// held forever.
			const entry = conversationByFileSlug.get(fileSlug);
			if (!entry) throw new Error(`No conversation found for fileSlug=${fileSlug}`);
			const lane = await allocator.acquire(fileSlug);
			try {
				const start = Date.now();
				const build = await lane.tracedBuild({
					conversation: entry.conversation,
					messageBudget: entry.messageBudget,
				});
				const buildDurationMs = Date.now() - start;
				buildDurations.set(key, buildDurationMs);
				stashTranscript(build);
				return { build, lane, buildDurationMs };
			} finally {
				allocator.release(lane, fileSlug);
			}
		})();
		buildCache.set(key, promise);
		return await promise;
	}

	/** Remember the build transcript so it can be re-attached after evaluate(). */
	function stashTranscript(build: BuildResult): void {
		if (build.threadId && build.transcript) {
			transcriptByThreadId.set(build.threadId, build.transcript);
		}
	}

	/** evaluate() target: build (or reuse) the workflow, then run one scenario against it. */
	const target = async (inputs: TargetInputs): Promise<TargetOutput> => {
		const iteration = inputs._iteration ?? 0;
		const scenario: ExecutionScenario = {
			name: inputs.scenarioName,
			description: inputs.scenarioDescription,
			dataSetup: inputs.dataSetup,
			successCriteria: inputs.successCriteria,
		};
		const {
			build,
			lane: builtOnLane,
			buildDurationMs,
		} = await getOrBuild(iteration, inputs.testCaseFile);
		if (!build.success || !build.workflowId) {
			return {
				buildSuccess: false,
				passed: false,
				score: 0,
				reasoning: `Build failed: ${build.error ?? 'unknown'}`,
				failureCategory: 'build_failure',
				execErrors: build.error ? [build.error] : [],
				buildDurationMs,
				execDurationMs: 0,
				nodeCount: 0,
				threadId: build.threadId,
				workflowChecks: build.workflowChecks,
			};
		}
		const execStart = Date.now();
		const nodeCount = build.workflowJsons[0]?.nodes.length ?? 0;
		let result;
		try {
			result = await builtOnLane.tracedExecute({
				workflowId: build.workflowId,
				scenario,
				workflowJsons: build.workflowJsons,
			});
		} catch (error: unknown) {
			// Mirror direct mode's per-scenario guard — without this, n8n API errors
			// or verifier timeouts from executeWithLlmMock / verifyChecklist would
			// escape to LangSmith, come back as a Run with null outputs, and be
			// misclassified as builder regressions by the feedback extractor.
			const errorMessage = error instanceof Error ? error.message : String(error);
			logger.error(`  ERROR [${scenario.name}]: ${errorMessage}`);
			return {
				buildSuccess: true,
				workflowId: build.workflowId,
				passed: false,
				score: 0,
				reasoning: `Scenario execution error: ${errorMessage}`,
				failureCategory: 'framework_issue',
				execErrors: [errorMessage],
				buildDurationMs,
				execDurationMs: Date.now() - execStart,
				nodeCount,
				// Keep the thread id like the other return paths do, so the build
				// transcript can still be attached when execution blew up.
				threadId: build.threadId,
				workflowChecks: build.workflowChecks,
			};
		}
		const execDurationMs = Date.now() - execStart;
		// Strip failure fields on pass: the verifier sometimes returns "."
		// placeholders instead of omitting them.
		const failureCategory = result.success ? undefined : result.failureCategory;
		const rootCause = result.success ? undefined : result.rootCause;
		return {
			buildSuccess: true,
			workflowId: build.workflowId,
			passed: result.success,
			score: result.score,
			reasoning: result.reasoning,
			failureCategory,
			rootCause,
			execErrors: result.evalResult?.errors ?? [],
			evalResult: result.evalResult,
			buildDurationMs,
			execDurationMs,
			nodeCount,
			threadId: build.threadId,
			workflowChecks: build.workflowChecks,
		};
	};

	/** Turn one run's outputs into LangSmith feedback rows; malformed outputs yield none. */
	const feedbackExtractor = ({ run }: { run: Run }): EvaluationResult[] => {
		const output = parseTargetOutput(run.outputs);
		if (!output) return [];
		// 'none' for passed scenarios so the column shows a full categorical
		// breakdown instead of blank cells.
		const failureCategory = output.passed ? 'none' : (output.failureCategory ?? 'unknown');
		const feedback: EvaluationResult[] = [
			{
				key: 'scenario_pass',
				score: output.score,
				comment: output.reasoning || undefined,
			},
			{
				key: 'failure_category',
				value: failureCategory,
			},
			{
				key: 'exec_duration_s',
				score: output.execDurationMs / 1000,
			},
			{
				key: 'node_count',
				score: output.nodeCount,
			},
		];
		if (output.buildDurationMs !== undefined) {
			feedback.push({ key: 'build_duration_s', score: output.buildDurationMs / 1000 });
		}
		// Skip N/A so LangSmith column averages reduce to per-check pass-rate.
		if (output.workflowChecks) {
			for (const outcome of output.workflowChecks) {
				if (outcome.status === 'n_a') continue;
				feedback.push({
					key: `evals.workflows.${outcome.dimension}.${outcome.name}`,
					score: outcome.status === 'pass' ? 1 : 0,
					comment: outcome.comment,
				});
			}
		}
		return feedback;
	};

	const experimentPrefix = args.experimentName ?? computeExperimentPrefix();
	logger.info(
		`Starting evaluate() with concurrency=${String(args.concurrency)}, ${String(lanes.length)} lane(s) × ${String(MAX_CONCURRENT_BUILDS)} concurrent builds, iterations=${String(args.iterations)}`,
	);
	// Always filter the LangSmith dataset by the local file slugs. The local
	// JSON files are the source of truth; the dataset accumulates orphans (the
	// sync is additive — see langsmith/dataset-sync.ts) and we don't want to
	// run scenarios whose JSON file no longer exists.
	const sourceExamples = filteredExamplesIterable(
		lsClient,
		datasetName,
		args.filter,
		args.exclude,
		logger,
	);
	const evaluateData =
		args.iterations > 1
			? expandExamplesForIterations(sourceExamples, args.iterations)
			: sourceExamples;
	try {
		const evaluateStart = Date.now();
		const experimentResults = await evaluate(target, {
			data: evaluateData,
			evaluators: [feedbackExtractor],
			experimentPrefix,
			maxConcurrency: args.concurrency,
			client: lsClient,
			metadata: {
				filter: args.filter ?? 'all',
				exclude: args.exclude ?? null,
				prebuilt: prebuiltManifest !== undefined,
				concurrency: args.concurrency,
				maxBuilds: MAX_CONCURRENT_BUILDS,
				lanes: lanes.length,
				iterations: args.iterations,
				...buildCIMetadata(),
			},
		});
		const totalDurationMs = Date.now() - evaluateStart;
		logger.info(`Experiment: ${experimentResults.experimentName}`);
		await lsClient.awaitPendingTraceBatches();
		const allRunResults = reshapeLangSmithRuns(
			experimentResults.results,
			testCasesWithFiles,
			args.iterations,
			transcriptByThreadId,
		);
		const evaluation = aggregateResults(allRunResults, args.iterations);
		await updateExperimentAggregates({
			lsClient,
			experimentName: experimentResults.experimentName,
			runs: experimentResults.results,
			evaluation,
			buildDurations,
			totalDurationMs,
			logger,
		});
		await writePerRunPassMetrics({
			lsClient,
			runs: experimentResults.results,
			logger,
		});
		const outcome = await tryRunComparison({
			lsClient,
			prExperimentName: experimentResults.experimentName,
			evaluation,
			testCasesWithFiles,
			logger,
		});
		const slugByTestCase = new Map<WorkflowTestCase, string>(
			testCasesWithFiles.map(({ testCase, fileSlug }) => [testCase, fileSlug]),
		);
		return {
			evaluation,
			experimentName: experimentResults.experimentName,
			outcome,
			slugByTestCase,
		};
	} finally {
		if (!args.keepWorkflows) {
			await Promise.all(
				[...buildCache.values()].map(async (promise) => {
					try {
						const { build, lane } = await promise;
						await cleanupBuild(lane.runner.client, build, logger);
					} catch {
						// Best-effort
					}
				}),
			);
		}
	}
}
/**
 * Expand a source example stream into `iterations` copies, tagging each copy
 * with its 0-based `_iteration` in `inputs`.
 *
 * The source is drained fully into memory first, then handed to
 * `expandWithIterations`, which decides the output order (grouped by each
 * example's `testCaseFile`, or `'unknown'` when that input is not a string).
 * Per the original note, the intent is to interleave across test cases and
 * iterations so the in-flight set spans both, with concentration handled by
 * the work-stealing allocator at build time.
 */
async function* expandExamplesForIterations(
	source: AsyncIterable<Example>,
	iterations: number,
): AsyncIterable<Example> {
	const buffered: Example[] = [];
	for await (const example of source) {
		buffered.push(example);
	}

	const groupKeyOf = (example: Example) => {
		const file: unknown = example.inputs?.testCaseFile;
		return typeof file === 'string' ? file : 'unknown';
	};
	const withIteration = (example: Example, iteration: number) => ({
		...example,
		inputs: { ...example.inputs, _iteration: iteration },
	});

	yield* expandWithIterations(buffered, groupKeyOf, iterations, withIteration);
}
/**
 * List the dataset examples whose split matches a local test-case file slug.
 *
 * Local files are resolved with the same `filter` / `exclude` as the rest of
 * the run. When nothing matches locally, an empty async iterable is returned
 * and the dataset is not queried at all.
 */
function filteredExamplesIterable(
	lsClient: Client,
	datasetName: string,
	filter: string | undefined,
	exclude: string | undefined,
	logger: EvalLogger,
): AsyncIterable<Example> {
	const slugs = loadWorkflowTestCasesWithFiles(filter, exclude).map(({ fileSlug }) => fileSlug);

	// Human-readable description of the active selection, used only for logging.
	const activeSelectors = [
		filter ? `filter "${filter}"` : undefined,
		exclude ? `exclude "${exclude}"` : undefined,
	].filter((part): part is string => part !== undefined);
	const label = activeSelectors.length === 0 ? 'Local test cases' : activeSelectors.join(' + ');

	if (slugs.length > 0) {
		logger.info(`${label} matched ${String(slugs.length)} split(s): ${slugs.join(', ')}`);
		return lsClient.listExamples({ datasetName, splits: slugs });
	}

	logger.info(`${label} matched no local test case files`);
	return (async function* () {})();
}
/**
 * Write run-level aggregates (total duration, average build/exec time, number
 * of unique builds, per-iteration pass rate) into the experiment's metadata.
 *
 * Durations are stored in seconds rounded to one decimal. Failures to read or
 * update the project are logged at verbose level and otherwise ignored, so a
 * metadata hiccup never fails the evaluation.
 */
async function updateExperimentAggregates(config: {
	lsClient: Client;
	experimentName: string;
	runs: Array<{ run: Run }>;
	evaluation: MultiRunEvaluation;
	buildDurations: Map<string, number>;
	totalDurationMs: number;
	logger: EvalLogger;
}): Promise<void> {
	const { lsClient, experimentName, runs, evaluation, buildDurations, totalDurationMs, logger } =
		config;

	// Mean of a list, 0 for an empty list.
	const mean = (values: number[]): number =>
		values.length === 0 ? 0 : values.reduce((total, value) => total + value, 0) / values.length;
	// Milliseconds → seconds with one decimal place.
	const toRoundedSeconds = (ms: number): number => Math.round(ms / 100) / 10;

	const buildTimes = Array.from(buildDurations.values());
	const execTimes: number[] = [];
	for (const { run } of runs) {
		const execMs = parseTargetOutput(run.outputs)?.execDurationMs;
		if (typeof execMs === 'number') execTimes.push(execMs);
	}

	const aggregates = {
		duration_s: toRoundedSeconds(totalDurationMs),
		avg_build_s: toRoundedSeconds(mean(buildTimes)),
		avg_exec_s: toRoundedSeconds(mean(execTimes)),
		unique_builds: buildTimes.length,
		pass_rate_per_iter: computePassRatePerIter(evaluation),
	};

	try {
		const project = await lsClient.readProject({ projectName: experimentName });
		// `updateProject` replaces `extra` wholesale — carry the existing value
		// over so auto-set fields (splits, etc.) survive. Narrowed with runtime
		// guards rather than type assertions.
		const projectExtra = isPlainObject(project.extra) ? project.extra : {};
		const priorMetadata = isPlainObject(projectExtra.metadata) ? projectExtra.metadata : {};
		await lsClient.updateProject(project.id, {
			projectExtra,
			metadata: { ...priorMetadata, ...aggregates },
		});
		logger.verbose(`Updated experiment metadata: ${JSON.stringify(aggregates)}`);
	} catch (error: unknown) {
		const msg = error instanceof Error ? error.message : String(error);
		logger.verbose(`Could not update experiment metadata: ${msg}`);
	}
}
/**
 * Attach per-example pass metrics (`pass_at_k`, `pass_hat_k`) as feedback on
 * every run in the example's group. All N runs of the same example carry the
 * same value — that lets the LangSmith UI sort/filter individual runs by
 * their example's metric, and its per-experiment column aggregation reduces
 * to the mean across unique examples.
 *
 * Runs without a `reference_example_id` or with unparseable outputs are left
 * out of the grouping. Writes are best-effort: a failed write never rejects,
 * but failures are counted and reported once as a warning.
 */
async function writePerRunPassMetrics(config: {
	lsClient: Client;
	runs: Array<{ run: Run }>;
	logger: EvalLogger;
}): Promise<void> {
	const { lsClient, runs, logger } = config;
	// Group runs by reference_example_id, counting passes.
	const byExample = new Map<string, { runIds: string[]; passed: number; total: number }>();
	for (const { run } of runs) {
		const exampleId = run.reference_example_id;
		if (!exampleId) continue;
		const output = parseTargetOutput(run.outputs);
		if (!output) continue;
		const entry = byExample.get(exampleId) ?? { runIds: [], passed: 0, total: 0 };
		entry.runIds.push(run.id);
		entry.total++;
		if (output.passed) entry.passed++;
		byExample.set(exampleId, entry);
	}

	// Individual writes are best-effort: a transient API error on one run
	// shouldn't block the rest, so each promise absorbs its own rejection.
	// Failures are tallied so they don't vanish silently.
	let failedWrites = 0;
	const noteFailure = (): void => {
		failedWrites++;
	};
	const feedbackWrites: Array<Promise<unknown>> = [];
	for (const { runIds, passed, total } of byExample.values()) {
		// k equals the number of parsed runs for this example.
		const passAtKValue = passAtK(total, passed, total);
		const passHatKValue = passHatK(total, passed, total);
		for (const runId of runIds) {
			feedbackWrites.push(
				lsClient.createFeedback(runId, 'pass_at_k', { score: passAtKValue }).catch(noteFailure),
				lsClient.createFeedback(runId, 'pass_hat_k', { score: passHatKValue }).catch(noteFailure),
			);
		}
	}
	await Promise.all(feedbackWrites);

	if (failedWrites > 0) {
		logger.warn(
			`Failed to write ${String(failedWrites)} of ${String(feedbackWrites.length)} pass-metric feedback entries`,
		);
	}
	logger.verbose(
		`Wrote pass metrics feedback for ${String(byExample.size)} example(s) across ${String(runs.length)} run(s)`,
	);
}
/**
 * Convert LangSmith's flat run list into the `WorkflowTestCaseResult[][]`
 * shape the aggregator expects (outer: iterations, inner: test cases in
 * `testCasesWithFiles` order).
 *
 * Runs are indexed by (iteration, testCaseFile, scenarioName) taken from
 * their inputs; when two runs share a key the later one wins. A scenario with
 * no run, or with a run whose outputs don't parse, gets a failed stub
 * (score 0) with an explanatory `reasoning`. Test-case level fields
 * (build success, workflow id, thread id, build error, checks) are collected
 * from the scenario outputs, and the transcript is looked up by thread id.
 */
function reshapeLangSmithRuns(
	rows: Array<{ run: Run }>,
	testCasesWithFiles: WorkflowTestCaseWithFile[],
	numIterations: number,
	transcriptByThreadId: Map<string, TranscriptTurn[]>,
): WorkflowTestCaseResult[][] {
	const keyFor = (iteration: number, file: string, scenarioName: string): string =>
		`${String(iteration)}/${file}/${scenarioName}`;

	// `_iteration` was injected in expandExamplesForIterations; the schema
	// defaults it to 0 for single-run experiments.
	const runsByKey = new Map<string, Run>();
	for (const { run } of rows) {
		const parsedInputs = runInputsSchema.safeParse(run.inputs ?? {});
		if (parsedInputs.success) {
			const { _iteration, testCaseFile, scenarioName } = parsedInputs.data;
			runsByKey.set(keyFor(_iteration, testCaseFile, scenarioName), run);
		}
	}

	const perIteration: WorkflowTestCaseResult[][] = [];
	for (let iteration = 0; iteration < numIterations; iteration++) {
		const iterationResults: WorkflowTestCaseResult[] = [];
		for (const { testCase, fileSlug } of testCasesWithFiles) {
			const scenarioResults: ExecutionScenarioResult[] = [];
			let built = false;
			let builtWorkflowId: string | undefined;
			let buildError: string | undefined;
			let threadId: string | undefined;
			let workflowChecks: CheckOutcome[] | undefined;

			for (const scenario of testCase.executionScenarios) {
				const run = runsByKey.get(keyFor(iteration, fileSlug, scenario.name));
				const output = run === undefined ? undefined : parseTargetOutput(run.outputs);
				if (run === undefined || output === undefined) {
					const reasoning =
						run === undefined
							? 'No run result for this scenario'
							: 'Malformed run output — skipped';
					scenarioResults.push({ scenario, success: false, score: 0, reasoning });
					continue;
				}

				// Test-case level fields: later scenarios overwrite earlier ones,
				// except workflowChecks where the first non-empty value is kept.
				if (output.buildSuccess) {
					built = true;
				} else if (output.reasoning) {
					buildError = output.reasoning;
				}
				if (output.workflowId) builtWorkflowId = output.workflowId;
				if (output.threadId) threadId = output.threadId;
				if (!workflowChecks && output.workflowChecks) workflowChecks = output.workflowChecks;

				scenarioResults.push({
					scenario,
					success: output.passed,
					evalResult: output.evalResult,
					score: output.score,
					reasoning: output.reasoning,
					failureCategory: output.failureCategory,
					rootCause: output.rootCause,
				});
			}

			iterationResults.push({
				testCase,
				workflowBuildSuccess: built,
				workflowId: builtWorkflowId,
				executionScenarioResults: scenarioResults,
				buildError,
				threadId,
				transcript: threadId ? transcriptByThreadId.get(threadId) : undefined,
				workflowChecks,
			});
		}
		perIteration.push(iterationResults);
	}
	return perIteration;
}
// ---------------------------------------------------------------------------
// Direct mode: simple loop, no LangSmith dependency
// ---------------------------------------------------------------------------
/**
 * Direct mode: run every test case for every iteration without LangSmith.
 *
 * Test cases are partitioned round-robin across lanes. Each lane then works
 * through ONE queue containing all of its (iteration, test case) pairs with
 * at most `MAX_CONCURRENT_BUILDS` in flight. Sharing the queue across
 * iterations is what keeps the per-instance build cap intact: running each
 * iteration with its own limiter would put iterations × cap builds on a lane.
 *
 * @param config Parsed args, lanes, logger and optional prebuilt manifest.
 * @returns The aggregated evaluation; an empty one when no test case matches.
 */
async function runDirectLoop(config: RunConfig): Promise<MultiRunEvaluation> {
	const { args, lanes, logger, prebuiltManifest } = config;
	const testCasesWithFiles = loadWorkflowTestCasesWithFiles(args.filter, args.exclude);
	if (testCasesWithFiles.length === 0) {
		console.log('No workflow test cases found in evaluations/data/workflows/');
		return { totalRuns: 0, testCases: [] };
	}
	const totalScenarios = testCasesWithFiles.reduce(
		(sum, { testCase }) => sum + testCase.executionScenarios.length,
		0,
	);
	logger.info(
		`Running ${String(testCasesWithFiles.length)} test case(s) with ${String(totalScenarios)} scenario(s) × ${String(args.iterations)} iteration(s) across ${String(lanes.length)} lane(s)`,
	);

	// Distribute test cases across lanes by source-order index. Each bucket carries
	// the original index so we can re-sort lane outputs back to source order — the
	// aggregator indexes per-iteration results positionally.
	const indexed = testCasesWithFiles.map((tc, origIdx) => ({ tc, origIdx }));
	const buckets = partitionRoundRobin(indexed, lanes.length);

	const laneResults = await Promise.all(
		lanes.map(async (lane, laneIdx) => {
			const bucket = buckets[laneIdx];
			const laneTag =
				lanes.length > 1 ? ` [lane ${String(laneIdx + 1)}/${String(lanes.length)}]` : '';
			// Iteration-major order, so earlier iterations are started first.
			const work = Array.from({ length: args.iterations }, (_unused, iter) =>
				bucket.map(({ tc, origIdx }) => ({ tc, origIdx, iter })),
			).flat();
			const results = await runWithConcurrency(
				work,
				async ({ tc, iter }) =>
					await runWorkflowTestCase({
						client: lane.client,
						testCase: tc.testCase,
						timeoutMs: args.timeoutMs,
						seededCredentialTypes: lane.seedResult.seededTypes,
						preRunWorkflowIds: lane.preRunWorkflowIds,
						claimedWorkflowIds: lane.claimedWorkflowIds,
						logger,
						keepWorkflows: args.keepWorkflows,
						laneTag,
						prebuiltWorkflowId: pickPrebuiltWorkflowId(prebuiltManifest, tc.fileSlug, iter),
					}),
				MAX_CONCURRENT_BUILDS,
			);
			// Results line up with `work` by position.
			return work.map(({ iter, origIdx }, i) => ({ iter, origIdx, result: results[i] }));
		}),
	);

	// Regroup into [iteration][test case in source order].
	const entries = laneResults.flat();
	const allRunResults: WorkflowTestCaseResult[][] = Array.from(
		{ length: args.iterations },
		(_unused, iter) =>
			entries
				.filter((entry) => entry.iter === iter)
				.sort((a, b) => a.origIdx - b.origIdx)
				.map((entry) => entry.result),
	);
	return aggregateResults(allRunResults, args.iterations);
}
// ---------------------------------------------------------------------------
// eval-results.json output (same shape as CI PR comment expects)
// ---------------------------------------------------------------------------
/**
 * Build the flat list of test-case results that the HTML report renders,
 * one entry per (test case, iteration).
 *
 * Rendering only `tc.runs[0]` would hide iterations 2..N, so a flaky scenario
 * that passed once and failed twice would look clean in the uploaded artifact.
 *
 * - When `evaluation.totalRuns <= 1`, the first run of each test case is
 *   returned as-is (same object reference, no label added).
 * - Otherwise every run is shallow-copied and the text of its first
 *   conversation turn is prefixed with `[iter i/totalRuns]` so the cards are
 *   distinguishable. The input runs are not mutated; later turns are reused
 *   by reference.
 *
 * Output order is test-case order, then iteration order within a test case.
 */
function flattenRunsForReport(evaluation: MultiRunEvaluation): WorkflowTestCaseResult[] {
  const { totalRuns, testCases } = evaluation;

  // Single-iteration evaluations need no labelling.
  if (totalRuns <= 1) {
    return testCases.map(({ runs }) => runs[0]);
  }

  const flattened: WorkflowTestCaseResult[] = [];
  for (const { runs } of testCases) {
    runs.forEach((run, runIdx) => {
      const [firstTurn, ...laterTurns] = run.testCase.conversation;
      const labelledFirstTurn = {
        ...firstTurn,
        text: `[iter ${String(runIdx + 1)}/${String(totalRuns)}] ${firstTurn.text}`,
      };
      flattened.push({
        ...run,
        testCase: {
          ...run.testCase,
          conversation: [labelledFirstTurn, ...laterTurns],
        },
      });
    });
  }
  return flattened;
}
/**
 * Run-level headline numbers derived from a MultiRunEvaluation, as produced
 * by computeAggregateMetrics.
 */
interface AggregateMetrics {
/** Number of test cases with at least one successful build across iterations (buildSuccessCount > 0). */
built: number;
/** Total scenarios across all test cases. */
scenariosTotal: number;
/** Mean pass@k across scenarios at k = totalRuns (0..1); 0 when there are no scenarios. */
passAtK: number;
/** Mean pass^k across scenarios at k = totalRuns (0..1); 0 when there are no scenarios. */
passHatK: number;
/**
 * Index into each scenario's passAtK/passHatK array for k = totalRuns,
 * i.e. totalRuns - 1 clamped to a minimum of 0.
 */
kIndex: number;
/**
 * Pass rate of each iteration formatted as e.g. "37% / 37% / 37%".
 * Empty string when there are no scenarios.
 */
passRatePerIter: string;
}
/**
 * Reduce a multi-run evaluation to its headline numbers.
 *
 * Walks every test case once, counting test cases with at least one
 * successful build and summing each scenario's pass@k / pass^k entry at
 * `kIndex` (`totalRuns - 1`, never below 0). A missing entry counts as 0.
 * Both means are 0 when the evaluation contains no scenarios.
 *
 * @param evaluation Aggregated results across all iterations.
 * @returns The aggregate metrics, including the per-iteration pass-rate string.
 */
function computeAggregateMetrics(evaluation: MultiRunEvaluation): AggregateMetrics {
  const { totalRuns, testCases } = evaluation;
  const kIndex = Math.max(totalRuns - 1, 0);

  let built = 0;
  let scenariosTotal = 0;
  let passAtKSum = 0;
  let passHatKSum = 0;

  for (const tc of testCases) {
    if (tc.buildSuccessCount > 0) {
      built += 1;
    }
    for (const scenario of tc.executionScenarios) {
      scenariosTotal += 1;
      passAtKSum += scenario.passAtK[kIndex] ?? 0;
      passHatKSum += scenario.passHatK[kIndex] ?? 0;
    }
  }

  // Guard the division: an evaluation without scenarios reports 0, not NaN.
  const hasScenarios = scenariosTotal > 0;

  return {
    built,
    scenariosTotal,
    passAtK: hasScenarios ? passAtKSum / scenariosTotal : 0,
    passHatK: hasScenarios ? passHatKSum / scenariosTotal : 0,
    kIndex,
    passRatePerIter: computePassRatePerIter(evaluation),
  };
}
/**
 * Format the scenario pass rate of every iteration as whole-number
 * percentages joined by " / ", e.g. "37% / 37% / 37%".
 *
 * For iteration `i`, a scenario counts as passed when its `runs[i]` exists and
 * has a truthy `success`; a scenario with no run at that index counts as not
 * passed. The denominator is always the total number of scenarios.
 *
 * @returns The formatted string, or '' when the evaluation has no scenarios.
 */
function computePassRatePerIter(evaluation: MultiRunEvaluation): string {
  const { totalRuns, testCases } = evaluation;
  const scenarios = testCases.flatMap(({ executionScenarios }) => executionScenarios);
  const scenarioCount = scenarios.length;
  if (scenarioCount === 0) {
    return '';
  }

  const percentages: number[] = [];
  for (let iter = 0; iter < totalRuns; iter += 1) {
    let passedCount = 0;
    for (const scenario of scenarios) {
      if (scenario.runs[iter]?.success) {
        passedCount += 1;
      }
    }
    percentages.push(Math.round((passedCount / scenarioCount) * 100));
  }

  return percentages.map((pct) => `${String(pct)}%`).join(' / ');
}
/**
 * Persist the evaluation as two sibling artifacts in `outputDir` (or the
 * current working directory when `outputDir` is undefined):
 *
 * - `eval-results.json` — the structured report (summary, comparison payload,
 *   per-test-case / per-scenario / per-run details), pretty-printed with a
 *   2-space indent.
 * - `eval-pr-comment.md` — the markdown rendered by `formatComparisonMarkdown`.
 *
 * The target directory is created recursively if needed. The JSON file is
 * written before the markdown is rendered.
 *
 * @param evaluation Aggregated multi-run results.
 * @param duration Run duration, stored verbatim in the report.
 * @param outputDir Destination directory; falls back to `process.cwd()`.
 * @param experimentName Experiment name, stored verbatim in the report.
 * @param outcome Comparison outcome, if a comparison was attempted.
 * @param commitSha Forwarded to the markdown formatter.
 * @param slugByTestCase Forwarded to the markdown formatter.
 * @returns Paths of the two files written.
 */
function writeEvalResults(
  evaluation: MultiRunEvaluation,
  duration: number,
  outputDir: string | undefined,
  experimentName: string | undefined,
  outcome: ComparisonOutcome | undefined,
  commitSha: string | undefined,
  slugByTestCase: Map<WorkflowTestCase, string> | undefined,
): { jsonPath: string; prCommentPath: string } {
  const { totalRuns, testCases } = evaluation;
  const metrics = computeAggregateMetrics(evaluation);
  const comparisonResult = outcome?.kind === 'ok' ? outcome.result : undefined;
  const checksSummary = aggregateWorkflowChecks(evaluation);
  const timestamp = new Date().toISOString();

  const summary = {
    testCases: testCases.length,
    built: metrics.built,
    scenariosTotal: metrics.scenariosTotal,
    passAtK: metrics.passAtK,
    passHatK: metrics.passHatK,
    passRatePerIter: metrics.passRatePerIter,
    // Only present when the checks aggregation produced something.
    ...(checksSummary ? { workflowChecks: checksSummary } : {}),
  };

  // Structured comparison payload only — the rendered markdown goes to the
  // sibling `eval-pr-comment.md`, so consumers can pick the format they want
  // without re-running the eval.
  const comparison = comparisonResult
    ? {
        baseline: comparisonResult.baseline.experimentName,
        result: serializeComparison(comparisonResult),
      }
    : undefined;

  const testCaseEntries = testCases.map((tc) => {
    const name = tc.testCase.conversation[0].text.slice(0, 70);
    const workflowChecksPerRun = tc.runs.map((run) =>
      run.workflowChecks ? statusMap(run.workflowChecks) : null,
    );
    const scenarios = tc.executionScenarios.map((sa) => {
      const runs = sa.runs.map((sr) => ({
        passed: sr.success,
        score: sr.score,
        reasoning: sr.reasoning,
        failureCategory: sr.failureCategory,
        rootCause: sr.rootCause,
        execErrors: sr.evalResult?.errors ?? [],
        evalResult: sr.evalResult,
      }));
      return {
        name: sa.scenario.name,
        passCount: sa.passCount,
        totalRuns,
        passAtK: sa.passAtK[metrics.kIndex] ?? 0,
        passHatK: sa.passHatK[metrics.kIndex] ?? 0,
        runs,
      };
    });
    return {
      name,
      buildSuccessCount: tc.buildSuccessCount,
      totalRuns,
      workflowChecksPerRun,
      scenarios,
    };
  });

  const report = {
    timestamp,
    duration,
    totalRuns,
    experimentName,
    summary,
    comparison,
    // `comparisonStatus` records why the comparison was skipped when
    // applicable, so JSON consumers can distinguish "no baseline yet" from
    // "regression detection broke".
    comparisonStatus: outcome?.kind ?? 'not_attempted',
    comparisonError: outcome?.kind === 'fetch_failed' ? outcome.error : undefined,
    testCases: testCaseEntries,
  };

  const targetDir = outputDir ?? process.cwd();
  mkdirSync(targetDir, { recursive: true });
  const jsonPath = join(targetDir, 'eval-results.json');
  const prCommentPath = join(targetDir, 'eval-pr-comment.md');

  writeFileSync(jsonPath, JSON.stringify(report, null, 2));

  // The PR comment is written unconditionally; `outcome` is passed through
  // as-is (possibly undefined) and the formatter decides what to render.
  const markdown = formatComparisonMarkdown(evaluation, outcome, { commitSha, slugByTestCase });
  writeFileSync(prCommentPath, markdown);

  return { jsonPath, prCommentPath };
}
/**
 * Project a ComparisonResult onto the fixed set of fields that is embedded in
 * the JSON report: `pr`, `baseline`, `aggregate`, `scenarios`, `prOnly`,
 * `baselineOnly` and `failureCategories`. Any other property on `result` is
 * dropped.
 *
 * Values are passed through by reference — nothing is cloned or converted.
 *
 * NOTE(review): the stated motivation for this helper is that Maps don't
 * survive JSON.stringify by default; this function does not itself convert
 * any Map, so confirm none of the copied fields holds one.
 */
function serializeComparison(result: ComparisonResult): {
  pr: { experimentName: string };
  baseline: { experimentName: string };
  aggregate: ComparisonResult['aggregate'];
  scenarios: ComparisonResult['scenarios'];
  prOnly: ComparisonResult['prOnly'];
  baselineOnly: ComparisonResult['baselineOnly'];
  failureCategories: ComparisonResult['failureCategories'];
} {
  const { pr, baseline, aggregate, scenarios, prOnly, baselineOnly, failureCategories } = result;
  return { pr, baseline, aggregate, scenarios, prOnly, baselineOnly, failureCategories };
}
// ---------------------------------------------------------------------------
// Comparison vs the pinned baseline experiment
// ---------------------------------------------------------------------------
/**
 * Best-effort comparison of this run against the latest baseline experiment.
 *
 * Never throws: the eval run is not gated on the comparison. The result is a
 * tagged outcome so the PR comment can tell the cases apart:
 *
 * - `no_baseline`   — `findLatestBaseline` returned nothing.
 * - `self_baseline` — the latest baseline is this very experiment.
 * - `ok`            — comparison computed; carries the `compareBuckets` result.
 * - `fetch_failed`  — anything in the lookup/fetch/compare path threw; carries
 *                     the error message (also logged as a warning).
 */
async function tryRunComparison(config: {
  lsClient: Client;
  prExperimentName: string;
  evaluation: MultiRunEvaluation;
  testCasesWithFiles: WorkflowTestCaseWithFile[];
  logger: EvalLogger;
}): Promise<ComparisonOutcome> {
  const { lsClient, prExperimentName, evaluation, testCasesWithFiles, logger } = config;

  // Happy path, free of error handling; every throw is mapped by the caller below.
  const compareAgainstBaseline = async (): Promise<ComparisonOutcome> => {
    const baselineName = await findLatestBaseline(lsClient);

    if (!baselineName) {
      logger.verbose(
        'No baseline experiment found — skipping comparison. ' +
          'Run with --experiment-name instance-ai-baseline to create one.',
      );
      return { kind: 'no_baseline' };
    }

    if (baselineName === prExperimentName) {
      logger.verbose('Current run is the baseline — skipping comparison.');
      return { kind: 'self_baseline', experimentName: baselineName };
    }

    logger.info(`Comparing against baseline: ${baselineName}`);
    const baselineBucket = await fetchBaselineBucket(lsClient, baselineName);
    const prBucket = bucketFromEvaluation(evaluation, testCasesWithFiles, prExperimentName);
    return { kind: 'ok', result: compareBuckets(prBucket, baselineBucket) };
  };

  try {
    // `return await` keeps rejections inside this try block.
    return await compareAgainstBaseline();
  } catch (error: unknown) {
    const msg = error instanceof Error ? error.message : String(error);
    logger.warn(`Comparison vs baseline failed: ${msg}`);
    return { kind: 'fetch_failed', error: msg };
  }
}
/**
 * Convert the in-memory MultiRunEvaluation into an ExperimentBucket whose
 * scenarios are keyed by `${fileSlug}/${scenarioName}`.
 *
 * The `fileSlug` is resolved by test-case object identity (a Map keyed on the
 * test case), not by array position: the comparison key depends on the right
 * slug, and pairing by index would silently miscompare if the aggregate were
 * ever reordered.
 *
 * Per scenario, `passed` is the scenario's `passCount` and `total` is the
 * number of runs of its test case. A run contributes to the failure-category
 * tallies only when it did not succeed and has a failure category.
 * `trialTotal` counts every scenario run.
 *
 * @throws Error when a test case in `evaluation` has no (non-empty) slug in
 *   `testCasesWithFiles`.
 */
function bucketFromEvaluation(
  evaluation: MultiRunEvaluation,
  testCasesWithFiles: WorkflowTestCaseWithFile[],
  experimentName: string,
): ExperimentBucket {
  const slugByTestCase = new Map(
    testCasesWithFiles.map(({ testCase, fileSlug }) => [testCase, fileSlug]),
  );
  const scenarios = new Map<string, ScenarioCounts>();
  const failureCategoryTotals: Record<string, number> = {};
  let trialTotal = 0;

  // Increment one category in a tally, starting from 0 on first sight.
  const bump = (tally: Record<string, number>, category: string): void => {
    tally[category] = (tally[category] ?? 0) + 1;
  };

  for (const { testCase, runs, executionScenarios } of evaluation.testCases) {
    const fileSlug = slugByTestCase.get(testCase);
    if (!fileSlug) {
      throw new Error(
        `bucketFromEvaluation: no fileSlug for test case "${testCase.conversation[0].text.slice(0, 60)}"`,
      );
    }
    const total = runs.length;

    for (const { scenario, passCount, runs: scenarioRuns } of executionScenarios) {
      const key = `${fileSlug}/${scenario.name}`;
      const failureCategories: Record<string, number> = {};

      for (const { success, failureCategory } of scenarioRuns) {
        trialTotal += 1;
        // Only categorised failures are tallied.
        if (success || !failureCategory) {
          continue;
        }
        bump(failureCategories, failureCategory);
        bump(failureCategoryTotals, failureCategory);
      }

      scenarios.set(key, {
        testCaseFile: fileSlug,
        scenarioName: scenario.name,
        passed: passCount,
        total,
        failureCategories,
      });
    }
  }

  return { experimentName, scenarios, failureCategoryTotals, trialTotal };
}
// Script entry point: any rejection that escapes main() is logged and the
// process exits with status 1.
main().catch((fatal: unknown) => {
  console.error('Fatal error:', fatal);
  process.exit(1);
});