mirror of
https://github.com/n8n-io/n8n.git
synced 2026-05-27 14:57:21 +02:00
1250 lines
43 KiB
JavaScript
1250 lines
43 KiB
JavaScript
#!/usr/bin/env node
|
||
// ---------------------------------------------------------------------------
|
||
// Instance AI workflow eval CLI
|
||
//
|
||
// Runs workflow execution evaluations. When LANGSMITH_API_KEY is set, uses
|
||
// LangSmith's evaluate() for experiment tracking and tracing. Otherwise
|
||
// falls back to a direct loop with the same eval-results.json output.
|
||
// ---------------------------------------------------------------------------
|
||
|
||
import type { InstanceAiEvalExecutionResult } from '@n8n/api-types';
|
||
import { mkdirSync, writeFileSync } from 'fs';
|
||
import { Client } from 'langsmith';
|
||
import { evaluate } from 'langsmith/evaluation';
|
||
import type { EvaluationResult } from 'langsmith/evaluation';
|
||
import type { Example, Run } from 'langsmith/schemas';
|
||
import { traceable } from 'langsmith/traceable';
|
||
import { join } from 'path';
|
||
import { z } from 'zod';
|
||
|
||
import { aggregateResults, passAtK, passHatK } from './aggregator';
|
||
import { parseCliArgs } from './args';
|
||
import { buildCIMetadata, computeExperimentPrefix } from './ci-metadata';
|
||
import { LaneAllocator } from './lane-allocator';
|
||
import { expandWithIterations, partitionRoundRobin } from './lanes';
|
||
import { aggregateWorkflowChecks, statusMap } from '../binaryChecks/aggregate';
|
||
import { CHECK_DIMENSIONS, type CheckOutcome } from '../binaryChecks/types';
|
||
import { N8nClient } from '../clients/n8n-client';
|
||
import {
|
||
compareBuckets,
|
||
type ComparisonOutcome,
|
||
type ComparisonResult,
|
||
type ExperimentBucket,
|
||
type ScenarioCounts,
|
||
} from '../comparison/compare';
|
||
import { fetchBaselineBucket, findLatestBaseline } from '../comparison/fetch-baseline';
|
||
import { formatComparisonMarkdown, formatComparisonTerminal } from '../comparison/format';
|
||
import { seedCredentials, cleanupCredentials } from '../credentials/seeder';
|
||
import { loadWorkflowTestCasesWithFiles } from '../data/workflows';
|
||
import type { WorkflowTestCaseWithFile } from '../data/workflows';
|
||
import { createLogger } from '../harness/logger';
|
||
import type { EvalLogger } from '../harness/logger';
|
||
import {
|
||
fetchPrebuiltBuild,
|
||
loadPrebuiltManifest,
|
||
pickPrebuiltWorkflowId,
|
||
type PrebuiltManifest,
|
||
} from '../harness/prebuilt-workflows';
|
||
import {
|
||
buildWorkflow,
|
||
executeScenario,
|
||
cleanupBuild,
|
||
runWorkflowChecks,
|
||
runWorkflowTestCase,
|
||
runWithConcurrency,
|
||
type BuildResult,
|
||
} from '../harness/runner';
|
||
import { syncDataset, type DatasetExampleInputs } from '../langsmith/dataset-sync';
|
||
import { snapshotWorkflowIds } from '../outcome/workflow-discovery';
|
||
import { writeWorkflowReport } from '../report/workflow-report';
|
||
import type {
|
||
MultiRunEvaluation,
|
||
ExecutionScenarioResult,
|
||
ExecutionScenario,
|
||
TranscriptTurn,
|
||
WorkflowTestCase,
|
||
WorkflowTestCaseResult,
|
||
} from '../types';
|
||
|
||
// n8n degrades above ~4 concurrent builds.
|
||
const MAX_CONCURRENT_BUILDS = 4;
|
||
|
||
const checkOutcomeSchema = z.object({
|
||
name: z.string(),
|
||
description: z.string(),
|
||
kind: z.enum(['deterministic', 'llm']),
|
||
dimension: z.enum(CHECK_DIMENSIONS),
|
||
status: z.enum(['pass', 'fail', 'n_a']),
|
||
comment: z.string().optional(),
|
||
});
|
||
|
||
const targetOutputSchema = z.object({
|
||
buildSuccess: z.boolean().default(false),
|
||
passed: z.boolean().default(false),
|
||
score: z.number().default(0),
|
||
reasoning: z.string().default(''),
|
||
workflowId: z.string().optional(),
|
||
failureCategory: z.string().optional(),
|
||
rootCause: z.string().optional(),
|
||
execErrors: z.array(z.string()).default([]),
|
||
evalResult: z.unknown().optional(),
|
||
/** Only set on the scenario that initiated the build. */
|
||
buildDurationMs: z.number().optional(),
|
||
execDurationMs: z.number().default(0),
|
||
nodeCount: z.number().default(0),
|
||
/** The thread id used during the build — keys the LangSmith trace lookup. */
|
||
threadId: z.string().optional(),
|
||
workflowChecks: z.array(checkOutcomeSchema).optional(),
|
||
});
|
||
|
||
type TargetOutput = Omit<z.infer<typeof targetOutputSchema>, 'evalResult'> & {
|
||
evalResult?: InstanceAiEvalExecutionResult;
|
||
};
|
||
|
||
function isPlainObject(v: unknown): v is Record<string, unknown> {
|
||
return typeof v === 'object' && v !== null && !Array.isArray(v);
|
||
}
|
||
|
||
function isEvalResult(v: unknown): v is InstanceAiEvalExecutionResult {
|
||
if (!isPlainObject(v)) return false;
|
||
return (
|
||
typeof v.nodeResults === 'object' &&
|
||
v.nodeResults !== null &&
|
||
Array.isArray(v.errors) &&
|
||
typeof v.hints === 'object' &&
|
||
v.hints !== null
|
||
);
|
||
}
|
||
|
||
/** Safe-parse a run's outputs. Returns `undefined` if the row is malformed
|
||
* or missing, so callers can skip it instead of treating it as a genuine
|
||
* failed evaluation. Every field in the schema has a default, so an empty
|
||
* or nullish raw value would otherwise parse successfully into a "failed"
|
||
* shape (passed:false, score:0) — masking infra errors as builder regressions.
|
||
*/
|
||
function parseTargetOutput(raw: unknown): TargetOutput | undefined {
|
||
if (!isPlainObject(raw) || Object.keys(raw).length === 0) return undefined;
|
||
const parsed = targetOutputSchema.safeParse(raw);
|
||
if (!parsed.success) return undefined;
|
||
return {
|
||
...parsed.data,
|
||
evalResult: isEvalResult(parsed.data.evalResult) ? parsed.data.evalResult : undefined,
|
||
};
|
||
}
|
||
|
||
const runInputsSchema = z
|
||
.object({
|
||
testCaseFile: z.string().default(''),
|
||
scenarioName: z.string().default(''),
|
||
/** 0-based iteration index; injected during multi-run expansion. */
|
||
_iteration: z.number().int().nonnegative().default(0),
|
||
})
|
||
.passthrough();
|
||
|
||
/** Target input shape with the iteration index we inject for multi-run. */
|
||
type TargetInputs = DatasetExampleInputs & { _iteration?: number };
|
||
|
||
interface Lane {
|
||
client: N8nClient;
|
||
preRunWorkflowIds: Set<string>;
|
||
claimedWorkflowIds: Set<string>;
|
||
seedResult: { seededTypes: string[]; credentialIds: string[] };
|
||
}
|
||
|
||
interface RunConfig {
|
||
args: ReturnType<typeof parseCliArgs>;
|
||
lanes: Lane[];
|
||
logger: EvalLogger;
|
||
prebuiltManifest?: PrebuiltManifest;
|
||
}
|
||
|
||
async function main(): Promise<void> {
|
||
const args = parseCliArgs(process.argv.slice(2));
|
||
const logger = createLogger(args.verbose);
|
||
|
||
const prebuiltManifest = args.prebuiltWorkflows
|
||
? loadPrebuiltManifest(args.prebuiltWorkflows)
|
||
: undefined;
|
||
if (prebuiltManifest) {
|
||
// Multi-lane is for distributing the orchestrator build phase across
|
||
// n8n instances. Prebuilt workflows live on a single instance — fetching
|
||
// them from any other lane's URL would 404 — and prebuilt mode skips
|
||
// builds anyway, so multi-lane buys nothing. Refuse the combination
|
||
// rather than silently fetching from one lane and ignoring the rest.
|
||
if (args.baseUrls.length > 1) {
|
||
throw new Error(
|
||
'--prebuilt-workflows is incompatible with multiple --base-url values. Prebuilt workflows live on a single n8n instance; pass exactly one --base-url.',
|
||
);
|
||
}
|
||
const slugCount = Object.keys(prebuiltManifest).length;
|
||
logger.info(`Loaded prebuilt manifest: ${String(slugCount)} test case(s)`);
|
||
|
||
// Warn on slugs that don't match a local test-case file. Common cause
|
||
// is typos in the manifest — without this check, the typo silently
|
||
// falls through to an orchestrator build (or no run at all), and the
|
||
// user thinks the prebuilt path ran when it didn't.
|
||
const localSlugs = new Set(loadWorkflowTestCasesWithFiles().map((tc) => tc.fileSlug));
|
||
const orphanSlugs = Object.keys(prebuiltManifest).filter((slug) => !localSlugs.has(slug));
|
||
if (orphanSlugs.length > 0) {
|
||
logger.warn(
|
||
`Prebuilt manifest references ${String(orphanSlugs.length)} slug(s) with no matching local test case (will be ignored): ${orphanSlugs.join(', ')}`,
|
||
);
|
||
}
|
||
}
|
||
|
||
// One lane per base URL. The LangSmith path then uses a work-stealing
|
||
// allocator (lane-allocator.ts) to dispatch builds across lanes; the direct
|
||
// path partitions test cases statically per lane.
|
||
const lanes: Lane[] = await Promise.all(
|
||
args.baseUrls.map(async (baseUrl, idx) => {
|
||
const tag =
|
||
args.baseUrls.length > 1
|
||
? ` [lane ${String(idx + 1)}/${String(args.baseUrls.length)}]`
|
||
: '';
|
||
const client = new N8nClient(baseUrl);
|
||
logger.info(`Authenticating with ${baseUrl}...${tag}`);
|
||
await client.login(args.email, args.password);
|
||
logger.success(`Authenticated${tag}`);
|
||
|
||
logger.info(`Seeding credentials...${tag}`);
|
||
const seedResult = await seedCredentials(client, undefined, logger);
|
||
logger.info(`Seeded ${String(seedResult.credentialIds.length)} credential(s)${tag}`);
|
||
|
||
const preRunWorkflowIds = await snapshotWorkflowIds(client);
|
||
const claimedWorkflowIds = new Set<string>();
|
||
return { client, preRunWorkflowIds, claimedWorkflowIds, seedResult };
|
||
}),
|
||
);
|
||
|
||
const startTime = Date.now();
|
||
|
||
try {
|
||
const hasLangSmith = Boolean(process.env.LANGSMITH_API_KEY);
|
||
|
||
let evaluation: MultiRunEvaluation;
|
||
let experimentName: string | undefined;
|
||
let outcome: ComparisonOutcome | undefined;
|
||
let slugByTestCase: Map<WorkflowTestCase, string> | undefined;
|
||
|
||
if (hasLangSmith) {
|
||
logger.info('LangSmith API key detected, using evaluate() with experiment tracking');
|
||
const langsmithRun = await runWithLangSmith({ args, lanes, logger, prebuiltManifest });
|
||
evaluation = langsmithRun.evaluation;
|
||
experimentName = langsmithRun.experimentName;
|
||
outcome = langsmithRun.outcome;
|
||
slugByTestCase = langsmithRun.slugByTestCase;
|
||
} else {
|
||
logger.info('No LANGSMITH_API_KEY, running direct loop (results in eval-results.json only)');
|
||
evaluation = await runDirectLoop({ args, lanes, logger, prebuiltManifest });
|
||
}
|
||
|
||
const totalDuration = Date.now() - startTime;
|
||
const commitSha = process.env.LANGSMITH_REVISION_ID ?? process.env.GITHUB_SHA;
|
||
const { jsonPath, prCommentPath } = writeEvalResults(
|
||
evaluation,
|
||
totalDuration,
|
||
args.outputDir,
|
||
experimentName,
|
||
outcome,
|
||
commitSha,
|
||
slugByTestCase,
|
||
);
|
||
console.log(`Results: ${jsonPath}`);
|
||
console.log(`PR comment: ${prCommentPath}`);
|
||
const reportResults = flattenRunsForReport(evaluation);
|
||
const htmlPath = writeWorkflowReport(reportResults);
|
||
console.log(`Report: ${htmlPath}`);
|
||
console.log(
|
||
'\n' + formatComparisonTerminal(evaluation, outcome, { commitSha, slugByTestCase }),
|
||
);
|
||
} finally {
|
||
await Promise.all(
|
||
lanes.map(async (lane) => {
|
||
await cleanupCredentials(lane.client, lane.seedResult.credentialIds).catch(() => {});
|
||
}),
|
||
);
|
||
}
|
||
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// LangSmith mode: evaluate() with dataset sync, tracing, experiments
|
||
// ---------------------------------------------------------------------------
|
||
|
||
async function runWithLangSmith(config: RunConfig): Promise<{
|
||
evaluation: MultiRunEvaluation;
|
||
experimentName: string;
|
||
outcome: ComparisonOutcome;
|
||
slugByTestCase: Map<WorkflowTestCase, string>;
|
||
}> {
|
||
const { args, lanes, logger, prebuiltManifest } = config;
|
||
|
||
const lsClient = new Client();
|
||
const datasetName = await syncDataset(lsClient, args.dataset, logger, args.filter, args.exclude);
|
||
const testCasesWithFiles = loadWorkflowTestCasesWithFiles(args.filter, args.exclude);
|
||
|
||
// Stash transcripts by threadId so reshapeLangSmithRuns can merge them in —
|
||
// the LangSmith target() output schema doesn't carry the full transcript.
|
||
const transcriptByThreadId = new Map<string, TranscriptTurn[]>();
|
||
|
||
// LangSmith dataset rows carry only per-scenario fields. The conversation
|
||
// for the build is sourced locally, keyed by fileSlug.
|
||
const conversationByFileSlug = new Map<
|
||
string,
|
||
{ conversation: WorkflowTestCase['conversation']; messageBudget?: number }
|
||
>();
|
||
for (const { testCase, fileSlug } of testCasesWithFiles) {
|
||
conversationByFileSlug.set(fileSlug, {
|
||
conversation: testCase.conversation,
|
||
messageBudget: testCase.messageBudget,
|
||
});
|
||
}
|
||
|
||
// LaneState carries the allocator-managed counters (activeBuilds,
|
||
// inflightKeys) plus the lane's traced LangSmith wrappers. `runner` is
|
||
// the underlying Lane (n8n client, credential state) — named distinctly so
|
||
// it doesn't shadow the iteration variable `lane` in lanes.map().
|
||
interface LaneState {
|
||
runner: Lane;
|
||
activeBuilds: number;
|
||
inflightKeys: Set<string>;
|
||
tracedBuild: (buildArgs: {
|
||
conversation: WorkflowTestCase['conversation'];
|
||
messageBudget?: number;
|
||
}) => Promise<BuildResult>;
|
||
tracedExecute: (execArgs: {
|
||
workflowId: string;
|
||
scenario: ExecutionScenario;
|
||
workflowJsons: BuildResult['workflowJsons'];
|
||
}) => Promise<Awaited<ReturnType<typeof executeScenario>>>;
|
||
}
|
||
|
||
const laneStates: LaneState[] = lanes.map((lane, idx) => {
|
||
const laneNum = idx + 1;
|
||
const laneTag = lanes.length > 1 ? ` [lane ${String(laneNum)}/${String(lanes.length)}]` : '';
|
||
return {
|
||
runner: lane,
|
||
activeBuilds: 0,
|
||
inflightKeys: new Set<string>(),
|
||
tracedBuild: traceable(
|
||
async (buildArgs: {
|
||
conversation: WorkflowTestCase['conversation'];
|
||
messageBudget?: number;
|
||
}) =>
|
||
await buildWorkflow({
|
||
client: lane.client,
|
||
conversation: buildArgs.conversation,
|
||
messageBudget: buildArgs.messageBudget,
|
||
timeoutMs: args.timeoutMs,
|
||
preRunWorkflowIds: lane.preRunWorkflowIds,
|
||
claimedWorkflowIds: lane.claimedWorkflowIds,
|
||
logger,
|
||
laneTag,
|
||
}),
|
||
{
|
||
name: 'workflow_build',
|
||
run_type: 'chain',
|
||
client: lsClient,
|
||
metadata: { lane: laneNum },
|
||
},
|
||
),
|
||
tracedExecute: traceable(
|
||
async (execArgs: {
|
||
workflowId: string;
|
||
scenario: ExecutionScenario;
|
||
workflowJsons: BuildResult['workflowJsons'];
|
||
}) =>
|
||
await executeScenario(
|
||
lane.client,
|
||
execArgs.workflowId,
|
||
execArgs.scenario,
|
||
execArgs.workflowJsons,
|
||
logger,
|
||
args.timeoutMs,
|
||
),
|
||
{
|
||
name: 'scenario_execution',
|
||
run_type: 'chain',
|
||
client: lsClient,
|
||
metadata: { lane: laneNum },
|
||
},
|
||
),
|
||
};
|
||
});
|
||
|
||
// Work-stealing: each build acquires a lane that isn't already running its
|
||
// fileSlug, runs there (capped per-lane), then releases. Scenarios re-use the
|
||
// lane that built their workflow.
|
||
const allocator = new LaneAllocator(laneStates, MAX_CONCURRENT_BUILDS);
|
||
const buildCache = new Map<
|
||
string,
|
||
Promise<{ build: BuildResult; lane: LaneState; buildDurationMs: number }>
|
||
>();
|
||
const buildDurations = new Map<string, number>();
|
||
|
||
async function getOrBuild(
|
||
iteration: number,
|
||
fileSlug: string,
|
||
): Promise<{ build: BuildResult; lane: LaneState; buildDurationMs: number }> {
|
||
// Cache key on (iteration, fileSlug) — every scenario in a test-case file
|
||
// shares this build, and prebuilt + orchestrator-built paths use the same key.
|
||
const key = `${String(iteration)}:${fileSlug}`;
|
||
const existing = buildCache.get(key);
|
||
if (existing) return await existing;
|
||
const promise = (async () => {
|
||
const prebuiltId = pickPrebuiltWorkflowId(prebuiltManifest, fileSlug, iteration);
|
||
if (prebuiltId !== undefined) {
|
||
// Prebuilt path: no orchestrator concurrency to manage — just
|
||
// fetch the workflow. main() rejects multi-lane + prebuilt at
|
||
// startup, so laneStates always has exactly one entry here.
|
||
const lane = laneStates[0];
|
||
const start = Date.now();
|
||
const build = await fetchPrebuiltBuild(lane.runner.client, prebuiltId, logger);
|
||
const buildDurationMs = Date.now() - start;
|
||
buildDurations.set(key, buildDurationMs);
|
||
stashTranscript(build);
|
||
if (build.success && !build.workflowChecks) {
|
||
// No transcript in prebuilt mode — checks run with empty prompt context.
|
||
build.workflowChecks = await runWorkflowChecks({
|
||
workflow: build.workflowJsons[0],
|
||
prompt: '',
|
||
agentText: undefined,
|
||
logger,
|
||
});
|
||
}
|
||
return { build, lane, buildDurationMs };
|
||
}
|
||
// Orchestrator path: allocator spreads distinct fileSlugs across lanes;
|
||
// the build cache dedupes scenarios within one file.
|
||
const lane = await allocator.acquire(fileSlug);
|
||
const entry = conversationByFileSlug.get(fileSlug);
|
||
if (!entry) throw new Error(`No conversation found for fileSlug=${fileSlug}`);
|
||
try {
|
||
const start = Date.now();
|
||
const build = await lane.tracedBuild({
|
||
conversation: entry.conversation,
|
||
messageBudget: entry.messageBudget,
|
||
});
|
||
const buildDurationMs = Date.now() - start;
|
||
buildDurations.set(key, buildDurationMs);
|
||
stashTranscript(build);
|
||
return { build, lane, buildDurationMs };
|
||
} finally {
|
||
allocator.release(lane, fileSlug);
|
||
}
|
||
})();
|
||
buildCache.set(key, promise);
|
||
return await promise;
|
||
}
|
||
|
||
function stashTranscript(build: BuildResult): void {
|
||
if (build.threadId && build.transcript) {
|
||
transcriptByThreadId.set(build.threadId, build.transcript);
|
||
}
|
||
}
|
||
|
||
const target = async (inputs: TargetInputs): Promise<TargetOutput> => {
|
||
const iteration = inputs._iteration ?? 0;
|
||
const scenario: ExecutionScenario = {
|
||
name: inputs.scenarioName,
|
||
description: inputs.scenarioDescription,
|
||
dataSetup: inputs.dataSetup,
|
||
successCriteria: inputs.successCriteria,
|
||
};
|
||
|
||
const {
|
||
build,
|
||
lane: builtOnLane,
|
||
buildDurationMs,
|
||
} = await getOrBuild(iteration, inputs.testCaseFile);
|
||
|
||
if (!build.success || !build.workflowId) {
|
||
return {
|
||
buildSuccess: false,
|
||
passed: false,
|
||
score: 0,
|
||
reasoning: `Build failed: ${build.error ?? 'unknown'}`,
|
||
failureCategory: 'build_failure',
|
||
execErrors: build.error ? [build.error] : [],
|
||
buildDurationMs,
|
||
execDurationMs: 0,
|
||
nodeCount: 0,
|
||
threadId: build.threadId,
|
||
workflowChecks: build.workflowChecks,
|
||
};
|
||
}
|
||
|
||
const execStart = Date.now();
|
||
const nodeCount = build.workflowJsons[0]?.nodes.length ?? 0;
|
||
let result;
|
||
try {
|
||
result = await builtOnLane.tracedExecute({
|
||
workflowId: build.workflowId,
|
||
scenario,
|
||
workflowJsons: build.workflowJsons,
|
||
});
|
||
} catch (error: unknown) {
|
||
// Mirror direct mode's per-scenario guard — without this, n8n API errors
|
||
// or verifier timeouts from executeWithLlmMock / verifyChecklist would
|
||
// escape to LangSmith, come back as a Run with null outputs, and be
|
||
// misclassified as builder regressions by the feedback extractor.
|
||
const errorMessage = error instanceof Error ? error.message : String(error);
|
||
logger.error(` ERROR [${scenario.name}]: ${errorMessage}`);
|
||
return {
|
||
buildSuccess: true,
|
||
workflowId: build.workflowId,
|
||
passed: false,
|
||
score: 0,
|
||
reasoning: `Scenario execution error: ${errorMessage}`,
|
||
failureCategory: 'framework_issue',
|
||
execErrors: [errorMessage],
|
||
buildDurationMs,
|
||
execDurationMs: Date.now() - execStart,
|
||
nodeCount,
|
||
workflowChecks: build.workflowChecks,
|
||
};
|
||
}
|
||
const execDurationMs = Date.now() - execStart;
|
||
|
||
// Strip failure fields on pass: the verifier sometimes returns "."
|
||
// placeholders instead of omitting them.
|
||
const failureCategory = result.success ? undefined : result.failureCategory;
|
||
const rootCause = result.success ? undefined : result.rootCause;
|
||
|
||
return {
|
||
buildSuccess: true,
|
||
workflowId: build.workflowId,
|
||
passed: result.success,
|
||
score: result.score,
|
||
reasoning: result.reasoning,
|
||
failureCategory,
|
||
rootCause,
|
||
execErrors: result.evalResult?.errors ?? [],
|
||
evalResult: result.evalResult,
|
||
buildDurationMs,
|
||
execDurationMs,
|
||
nodeCount,
|
||
threadId: build.threadId,
|
||
workflowChecks: build.workflowChecks,
|
||
};
|
||
};
|
||
|
||
const feedbackExtractor = ({ run }: { run: Run }): EvaluationResult[] => {
|
||
const output = parseTargetOutput(run.outputs);
|
||
if (!output) return [];
|
||
// 'none' for passed scenarios so the column shows a full categorical
|
||
// breakdown instead of blank cells.
|
||
const failureCategory = output.passed ? 'none' : (output.failureCategory ?? 'unknown');
|
||
const feedback: EvaluationResult[] = [
|
||
{
|
||
key: 'scenario_pass',
|
||
score: output.score,
|
||
comment: output.reasoning || undefined,
|
||
},
|
||
{
|
||
key: 'failure_category',
|
||
value: failureCategory,
|
||
},
|
||
{
|
||
key: 'exec_duration_s',
|
||
score: output.execDurationMs / 1000,
|
||
},
|
||
{
|
||
key: 'node_count',
|
||
score: output.nodeCount,
|
||
},
|
||
];
|
||
if (output.buildDurationMs !== undefined) {
|
||
feedback.push({ key: 'build_duration_s', score: output.buildDurationMs / 1000 });
|
||
}
|
||
// Skip N/A so LangSmith column averages reduce to per-check pass-rate.
|
||
if (output.workflowChecks) {
|
||
for (const outcome of output.workflowChecks) {
|
||
if (outcome.status === 'n_a') continue;
|
||
feedback.push({
|
||
key: `evals.workflows.${outcome.dimension}.${outcome.name}`,
|
||
score: outcome.status === 'pass' ? 1 : 0,
|
||
comment: outcome.comment ?? undefined,
|
||
});
|
||
}
|
||
}
|
||
return feedback;
|
||
};
|
||
|
||
const experimentPrefix = args.experimentName ?? computeExperimentPrefix();
|
||
|
||
logger.info(
|
||
`Starting evaluate() with concurrency=${String(args.concurrency)}, ${String(lanes.length)} lane(s) × ${String(MAX_CONCURRENT_BUILDS)} concurrent builds, iterations=${String(args.iterations)}`,
|
||
);
|
||
|
||
// Always filter the LangSmith dataset by the local file slugs. The local
|
||
// JSON files are the source of truth; the dataset accumulates orphans (the
|
||
// sync is additive — see langsmith/dataset-sync.ts) and we don't want to
|
||
// run scenarios whose JSON file no longer exists.
|
||
const sourceExamples = filteredExamplesIterable(
|
||
lsClient,
|
||
datasetName,
|
||
args.filter,
|
||
args.exclude,
|
||
logger,
|
||
);
|
||
const evaluateData =
|
||
args.iterations > 1
|
||
? expandExamplesForIterations(sourceExamples, args.iterations)
|
||
: sourceExamples;
|
||
|
||
try {
|
||
const evaluateStart = Date.now();
|
||
const experimentResults = await evaluate(target, {
|
||
data: evaluateData,
|
||
evaluators: [feedbackExtractor],
|
||
experimentPrefix,
|
||
maxConcurrency: args.concurrency,
|
||
client: lsClient,
|
||
metadata: {
|
||
filter: args.filter ?? 'all',
|
||
exclude: args.exclude ?? null,
|
||
prebuilt: prebuiltManifest !== undefined,
|
||
concurrency: args.concurrency,
|
||
maxBuilds: MAX_CONCURRENT_BUILDS,
|
||
lanes: lanes.length,
|
||
iterations: args.iterations,
|
||
...buildCIMetadata(),
|
||
},
|
||
});
|
||
const totalDurationMs = Date.now() - evaluateStart;
|
||
|
||
logger.info(`Experiment: ${experimentResults.experimentName}`);
|
||
await lsClient.awaitPendingTraceBatches();
|
||
|
||
const allRunResults = reshapeLangSmithRuns(
|
||
experimentResults.results,
|
||
testCasesWithFiles,
|
||
args.iterations,
|
||
transcriptByThreadId,
|
||
);
|
||
const evaluation = aggregateResults(allRunResults, args.iterations);
|
||
|
||
await updateExperimentAggregates({
|
||
lsClient,
|
||
experimentName: experimentResults.experimentName,
|
||
runs: experimentResults.results,
|
||
evaluation,
|
||
buildDurations,
|
||
totalDurationMs,
|
||
logger,
|
||
});
|
||
|
||
await writePerRunPassMetrics({
|
||
lsClient,
|
||
runs: experimentResults.results,
|
||
logger,
|
||
});
|
||
|
||
const outcome = await tryRunComparison({
|
||
lsClient,
|
||
prExperimentName: experimentResults.experimentName,
|
||
evaluation,
|
||
testCasesWithFiles,
|
||
logger,
|
||
});
|
||
|
||
const slugByTestCase = new Map<WorkflowTestCase, string>(
|
||
testCasesWithFiles.map(({ testCase, fileSlug }) => [testCase, fileSlug]),
|
||
);
|
||
|
||
return {
|
||
evaluation,
|
||
experimentName: experimentResults.experimentName,
|
||
outcome,
|
||
slugByTestCase,
|
||
};
|
||
} finally {
|
||
if (!args.keepWorkflows) {
|
||
await Promise.all(
|
||
[...buildCache.values()].map(async (promise) => {
|
||
try {
|
||
const { build, lane } = await promise;
|
||
await cleanupBuild(lane.runner.client, build, logger);
|
||
} catch {
|
||
// Best-effort
|
||
}
|
||
}),
|
||
);
|
||
}
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Expand a source example stream into N copies, tagging each with `_iteration`.
|
||
* Round-robins scenarios across test cases and iter-interleaves per scenario
|
||
* so the in-flight set spans both dimensions. Concentration is handled by the
|
||
* work-stealing allocator at build time.
|
||
*/
|
||
async function* expandExamplesForIterations(
|
||
source: AsyncIterable<Example>,
|
||
iterations: number,
|
||
): AsyncIterable<Example> {
|
||
const cached: Example[] = [];
|
||
for await (const ex of source) cached.push(ex);
|
||
yield* expandWithIterations(
|
||
cached,
|
||
(ex) => (typeof ex.inputs?.testCaseFile === 'string' ? ex.inputs.testCaseFile : 'unknown'),
|
||
iterations,
|
||
(ex, i) => ({ ...ex, inputs: { ...ex.inputs, _iteration: i } }),
|
||
);
|
||
}
|
||
|
||
function filteredExamplesIterable(
|
||
lsClient: Client,
|
||
datasetName: string,
|
||
filter: string | undefined,
|
||
exclude: string | undefined,
|
||
logger: EvalLogger,
|
||
): AsyncIterable<Example> {
|
||
const slugs = loadWorkflowTestCasesWithFiles(filter, exclude).map((tc) => tc.fileSlug);
|
||
const labelParts: string[] = [];
|
||
if (filter) labelParts.push(`filter "${filter}"`);
|
||
if (exclude) labelParts.push(`exclude "${exclude}"`);
|
||
const label = labelParts.length > 0 ? labelParts.join(' + ') : 'Local test cases';
|
||
if (slugs.length === 0) {
|
||
logger.info(`${label} matched no local test case files`);
|
||
return (async function* () {})();
|
||
}
|
||
logger.info(`${label} matched ${String(slugs.length)} split(s): ${slugs.join(', ')}`);
|
||
return lsClient.listExamples({ datasetName, splits: slugs });
|
||
}
|
||
|
||
async function updateExperimentAggregates(config: {
|
||
lsClient: Client;
|
||
experimentName: string;
|
||
runs: Array<{ run: Run }>;
|
||
evaluation: MultiRunEvaluation;
|
||
buildDurations: Map<string, number>;
|
||
totalDurationMs: number;
|
||
logger: EvalLogger;
|
||
}): Promise<void> {
|
||
const { lsClient, experimentName, runs, evaluation, buildDurations, totalDurationMs, logger } =
|
||
config;
|
||
|
||
const buildTimes = [...buildDurations.values()];
|
||
const uniqueBuilds = buildTimes.length;
|
||
const avgBuildMs =
|
||
uniqueBuilds > 0 ? buildTimes.reduce((sum, d) => sum + d, 0) / uniqueBuilds : 0;
|
||
|
||
const execTimes = runs
|
||
.map(({ run }) => parseTargetOutput(run.outputs)?.execDurationMs)
|
||
.filter((ms): ms is number => typeof ms === 'number');
|
||
const avgExecMs =
|
||
execTimes.length > 0 ? execTimes.reduce((sum, d) => sum + d, 0) / execTimes.length : 0;
|
||
|
||
const aggregates = {
|
||
duration_s: Math.round(totalDurationMs / 100) / 10,
|
||
avg_build_s: Math.round(avgBuildMs / 100) / 10,
|
||
avg_exec_s: Math.round(avgExecMs / 100) / 10,
|
||
unique_builds: uniqueBuilds,
|
||
pass_rate_per_iter: computePassRatePerIter(evaluation),
|
||
};
|
||
|
||
try {
|
||
const project = await lsClient.readProject({ projectName: experimentName });
|
||
// `updateProject` replaces `extra` wholesale — preserve it so auto-set
|
||
// fields (splits, etc.) survive. Narrow via typeof guards rather than `as`.
|
||
const existingExtra = isPlainObject(project.extra) ? project.extra : {};
|
||
const existingMetadata = isPlainObject(existingExtra.metadata) ? existingExtra.metadata : {};
|
||
await lsClient.updateProject(project.id, {
|
||
projectExtra: existingExtra,
|
||
metadata: { ...existingMetadata, ...aggregates },
|
||
});
|
||
logger.verbose(`Updated experiment metadata: ${JSON.stringify(aggregates)}`);
|
||
} catch (error: unknown) {
|
||
const msg = error instanceof Error ? error.message : String(error);
|
||
logger.verbose(`Could not update experiment metadata: ${msg}`);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Attach per-example pass metrics (pass_rate, pass_at_k, pass_hat_k) as
|
||
* feedback on every run in the example's group. All N runs of the same example
|
||
* carry the same value — that lets the LangSmith UI sort/filter individual
|
||
* runs by their example's metric, and its per-experiment column aggregation
|
||
* reduces to the mean across unique examples.
|
||
*/
|
||
async function writePerRunPassMetrics(config: {
|
||
lsClient: Client;
|
||
runs: Array<{ run: Run }>;
|
||
logger: EvalLogger;
|
||
}): Promise<void> {
|
||
const { lsClient, runs, logger } = config;
|
||
|
||
// Group runs by reference_example_id, counting passes.
|
||
const byExample = new Map<string, { runIds: string[]; passed: number; total: number }>();
|
||
for (const { run } of runs) {
|
||
const exampleId = run.reference_example_id;
|
||
if (!exampleId) continue;
|
||
const output = parseTargetOutput(run.outputs);
|
||
if (!output) continue;
|
||
const entry = byExample.get(exampleId) ?? { runIds: [], passed: 0, total: 0 };
|
||
entry.runIds.push(run.id);
|
||
entry.total++;
|
||
if (output.passed) entry.passed++;
|
||
byExample.set(exampleId, entry);
|
||
}
|
||
|
||
// Individual writes are best-effort: a transient API error on one run
|
||
// shouldn't block the rest, so we swallow per-promise and keep going.
|
||
const feedbackWrites: Array<Promise<unknown>> = [];
|
||
for (const { runIds, passed, total } of byExample.values()) {
|
||
const passAtKValue = passAtK(total, passed, total);
|
||
const passHatKValue = passHatK(total, passed, total);
|
||
for (const runId of runIds) {
|
||
feedbackWrites.push(
|
||
lsClient.createFeedback(runId, 'pass_at_k', { score: passAtKValue }).catch(() => {}),
|
||
lsClient.createFeedback(runId, 'pass_hat_k', { score: passHatKValue }).catch(() => {}),
|
||
);
|
||
}
|
||
}
|
||
|
||
await Promise.all(feedbackWrites);
|
||
logger.verbose(
|
||
`Wrote pass metrics feedback for ${String(byExample.size)} example(s) across ${String(runs.length)} run(s)`,
|
||
);
|
||
}
|
||
|
||
/**
|
||
* Convert LangSmith's flat `Run[]` into the `WorkflowTestCaseResult[][]` shape
|
||
* the aggregator expects (outer: runs, inner: test cases). Groups by
|
||
* (testCaseFile, scenarioName), then reconstructs per-iteration test case
|
||
* results. Scenarios with no matching run get a build_failure stub.
|
||
*/
|
||
function reshapeLangSmithRuns(
|
||
rows: Array<{ run: Run }>,
|
||
testCasesWithFiles: WorkflowTestCaseWithFile[],
|
||
numIterations: number,
|
||
transcriptByThreadId: Map<string, TranscriptTurn[]>,
|
||
): WorkflowTestCaseResult[][] {
|
||
// Index runs by (iteration, testCaseFile, scenarioName) using the `_iteration`
|
||
// we injected in expandExamplesForIterations. Falls back to 0 for single-run.
|
||
const byKey = new Map<string, Run>();
|
||
for (const { run } of rows) {
|
||
const inputs = runInputsSchema.safeParse(run.inputs ?? {});
|
||
if (!inputs.success) continue;
|
||
const key = `${String(inputs.data._iteration)}/${inputs.data.testCaseFile}/${inputs.data.scenarioName}`;
|
||
byKey.set(key, run);
|
||
}
|
||
|
||
const allRunResults: WorkflowTestCaseResult[][] = [];
|
||
for (let iter = 0; iter < numIterations; iter++) {
|
||
const runResults: WorkflowTestCaseResult[] = [];
|
||
for (const { testCase, fileSlug } of testCasesWithFiles) {
|
||
const executionScenarioResults: ExecutionScenarioResult[] = [];
|
||
let workflowBuildSuccess = false;
|
||
let workflowId: string | undefined;
|
||
let buildError: string | undefined;
|
||
let threadId: string | undefined;
|
||
let workflowChecks: CheckOutcome[] | undefined;
|
||
|
||
for (const scenario of testCase.executionScenarios) {
|
||
const run = byKey.get(`${String(iter)}/${fileSlug}/${scenario.name}`);
|
||
const output = run ? parseTargetOutput(run.outputs) : undefined;
|
||
if (!run || !output) {
|
||
executionScenarioResults.push({
|
||
scenario,
|
||
success: false,
|
||
score: 0,
|
||
reasoning: run ? 'Malformed run output — skipped' : 'No run result for this scenario',
|
||
});
|
||
continue;
|
||
}
|
||
if (output.buildSuccess) workflowBuildSuccess = true;
|
||
if (output.workflowId) workflowId = output.workflowId;
|
||
if (output.threadId) threadId = output.threadId;
|
||
if (!output.buildSuccess && output.reasoning) buildError = output.reasoning;
|
||
if (output.workflowChecks && !workflowChecks) workflowChecks = output.workflowChecks;
|
||
executionScenarioResults.push({
|
||
scenario,
|
||
success: output.passed,
|
||
evalResult: output.evalResult,
|
||
score: output.score,
|
||
reasoning: output.reasoning,
|
||
failureCategory: output.failureCategory,
|
||
rootCause: output.rootCause,
|
||
});
|
||
}
|
||
|
||
const transcript = threadId ? transcriptByThreadId.get(threadId) : undefined;
|
||
runResults.push({
|
||
testCase,
|
||
workflowBuildSuccess,
|
||
workflowId,
|
||
executionScenarioResults,
|
||
buildError,
|
||
threadId,
|
||
transcript,
|
||
workflowChecks,
|
||
});
|
||
}
|
||
allRunResults.push(runResults);
|
||
}
|
||
return allRunResults;
|
||
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Direct mode: simple loop, no LangSmith dependency
|
||
// ---------------------------------------------------------------------------
|
||
|
||
async function runDirectLoop(config: RunConfig): Promise<MultiRunEvaluation> {
|
||
const { args, lanes, logger, prebuiltManifest } = config;
|
||
|
||
const testCasesWithFiles = loadWorkflowTestCasesWithFiles(args.filter, args.exclude);
|
||
if (testCasesWithFiles.length === 0) {
|
||
console.log('No workflow test cases found in evaluations/data/workflows/');
|
||
return { totalRuns: 0, testCases: [] };
|
||
}
|
||
|
||
const totalScenarios = testCasesWithFiles.reduce(
|
||
(sum, { testCase }) => sum + testCase.executionScenarios.length,
|
||
0,
|
||
);
|
||
logger.info(
|
||
`Running ${String(testCasesWithFiles.length)} test case(s) with ${String(totalScenarios)} scenario(s) × ${String(args.iterations)} iteration(s) across ${String(lanes.length)} lane(s)`,
|
||
);
|
||
|
||
// Distribute test cases across lanes by source-order index. Each bucket carries
|
||
// the original index so we can re-sort lane outputs back to source order — the
|
||
// aggregator indexes per-iteration results positionally.
|
||
const indexed = testCasesWithFiles.map((tc, origIdx) => ({ tc, origIdx }));
|
||
const buckets = partitionRoundRobin(indexed, lanes.length);
|
||
|
||
// Iterations are independent — run them in parallel.
|
||
const allRunResults: WorkflowTestCaseResult[][] = await Promise.all(
|
||
Array.from({ length: args.iterations }, async (_unused, iter) => {
|
||
if (args.iterations > 1) {
|
||
logger.info(`--- Iteration #${String(iter + 1)}/${String(args.iterations)} starting ---`);
|
||
}
|
||
const laneResults = await Promise.all(
|
||
lanes.map(async (lane, laneIdx) => {
|
||
const bucket = buckets[laneIdx];
|
||
const laneTag =
|
||
lanes.length > 1 ? ` [lane ${String(laneIdx + 1)}/${String(lanes.length)}]` : '';
|
||
const results = await runWithConcurrency(
|
||
bucket,
|
||
async ({ tc }) =>
|
||
await runWorkflowTestCase({
|
||
client: lane.client,
|
||
testCase: tc.testCase,
|
||
timeoutMs: args.timeoutMs,
|
||
seededCredentialTypes: lane.seedResult.seededTypes,
|
||
preRunWorkflowIds: lane.preRunWorkflowIds,
|
||
claimedWorkflowIds: lane.claimedWorkflowIds,
|
||
logger,
|
||
keepWorkflows: args.keepWorkflows,
|
||
laneTag,
|
||
prebuiltWorkflowId: pickPrebuiltWorkflowId(prebuiltManifest, tc.fileSlug, iter),
|
||
}),
|
||
MAX_CONCURRENT_BUILDS,
|
||
);
|
||
return bucket.map((b, i) => ({ origIdx: b.origIdx, result: results[i] }));
|
||
}),
|
||
);
|
||
const flat = laneResults.flat();
|
||
flat.sort((a, b) => a.origIdx - b.origIdx);
|
||
return flat.map((x) => x.result);
|
||
}),
|
||
);
|
||
|
||
return aggregateResults(allRunResults, args.iterations);
|
||
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// eval-results.json output (same shape as CI PR comment expects)
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/**
|
||
* Flatten per-iteration runs into a single list of test-case results for the
|
||
* HTML report. Previously we rendered only `tc.runs[0]`, which silently hid
|
||
* iterations 2..N — a flaky scenario that passed once and failed twice would
|
||
* appear clean in the uploaded artifact. For multi-iteration runs we prefix
|
||
* the opening user turn with its iteration number so the cards are
|
||
* distinguishable at a glance.
|
||
*/
|
||
function flattenRunsForReport(evaluation: MultiRunEvaluation): WorkflowTestCaseResult[] {
|
||
if (evaluation.totalRuns <= 1) {
|
||
return evaluation.testCases.map((tc) => tc.runs[0]);
|
||
}
|
||
return evaluation.testCases.flatMap((tc) =>
|
||
tc.runs.map((run, iter) => {
|
||
const [opening, ...rest] = run.testCase.conversation;
|
||
return {
|
||
...run,
|
||
testCase: {
|
||
...run.testCase,
|
||
conversation: [
|
||
{
|
||
...opening,
|
||
text: `[iter ${String(iter + 1)}/${String(evaluation.totalRuns)}] ${opening.text}`,
|
||
},
|
||
...rest,
|
||
],
|
||
},
|
||
};
|
||
}),
|
||
);
|
||
}
|
||
|
||
interface AggregateMetrics {
|
||
/** Number of test cases with at least one successful build across iterations. */
|
||
built: number;
|
||
/** Total scenarios across all test cases. */
|
||
scenariosTotal: number;
|
||
/** Mean pass@k across scenarios at k = totalRuns (0..1). */
|
||
passAtK: number;
|
||
/** Mean pass^k across scenarios at k = totalRuns (0..1). */
|
||
passHatK: number;
|
||
/** Index into each scenario's passAtK/passHatK array for k = totalRuns. */
|
||
kIndex: number;
|
||
/** Pass rate of each iteration formatted as e.g. "37% / 37% / 37%". */
|
||
passRatePerIter: string;
|
||
}
|
||
|
||
function computeAggregateMetrics(evaluation: MultiRunEvaluation): AggregateMetrics {
|
||
const { totalRuns, testCases } = evaluation;
|
||
const allScenarios = testCases.flatMap((tc) => tc.executionScenarios);
|
||
const total = allScenarios.length;
|
||
const kIndex = Math.max(totalRuns - 1, 0);
|
||
const built = testCases.filter((tc) => tc.buildSuccessCount > 0).length;
|
||
const passAtK =
|
||
total > 0 ? allScenarios.reduce((sum, s) => sum + (s.passAtK[kIndex] ?? 0), 0) / total : 0;
|
||
const passHatK =
|
||
total > 0 ? allScenarios.reduce((sum, s) => sum + (s.passHatK[kIndex] ?? 0), 0) / total : 0;
|
||
return {
|
||
built,
|
||
scenariosTotal: total,
|
||
passAtK,
|
||
passHatK,
|
||
kIndex,
|
||
passRatePerIter: computePassRatePerIter(evaluation),
|
||
};
|
||
}
|
||
|
||
/** Pass rate of each iteration formatted as e.g. "37% / 37% / 37%". */
|
||
function computePassRatePerIter(evaluation: MultiRunEvaluation): string {
|
||
const { totalRuns, testCases } = evaluation;
|
||
const allScenarios = testCases.flatMap((tc) => tc.executionScenarios);
|
||
if (allScenarios.length === 0) return '';
|
||
const rates: string[] = [];
|
||
for (let i = 0; i < totalRuns; i++) {
|
||
const passed = allScenarios.filter((s) => s.runs[i]?.success).length;
|
||
rates.push(`${String(Math.round((passed / allScenarios.length) * 100))}%`);
|
||
}
|
||
return rates.join(' / ');
|
||
}
|
||
|
||
function writeEvalResults(
|
||
evaluation: MultiRunEvaluation,
|
||
duration: number,
|
||
outputDir: string | undefined,
|
||
experimentName: string | undefined,
|
||
outcome: ComparisonOutcome | undefined,
|
||
commitSha: string | undefined,
|
||
slugByTestCase: Map<WorkflowTestCase, string> | undefined,
|
||
): { jsonPath: string; prCommentPath: string } {
|
||
const { totalRuns, testCases } = evaluation;
|
||
const metrics = computeAggregateMetrics(evaluation);
|
||
|
||
const result = outcome?.kind === 'ok' ? outcome.result : undefined;
|
||
|
||
const checksSummary = aggregateWorkflowChecks(evaluation);
|
||
|
||
const report = {
|
||
timestamp: new Date().toISOString(),
|
||
duration,
|
||
totalRuns,
|
||
experimentName,
|
||
summary: {
|
||
testCases: testCases.length,
|
||
built: metrics.built,
|
||
scenariosTotal: metrics.scenariosTotal,
|
||
passAtK: metrics.passAtK,
|
||
passHatK: metrics.passHatK,
|
||
passRatePerIter: metrics.passRatePerIter,
|
||
...(checksSummary ? { workflowChecks: checksSummary } : {}),
|
||
},
|
||
// Structured comparison payload only — the rendered markdown lives in
|
||
// the sibling `eval-pr-comment.md` file so consumers can pick the format
|
||
// they want without re-running the eval. `comparisonStatus` records why
|
||
// the comparison was skipped when applicable, so JSON consumers can
|
||
// distinguish "no baseline yet" from "regression detection broke".
|
||
comparison: result
|
||
? {
|
||
baseline: result.baseline.experimentName,
|
||
result: serializeComparison(result),
|
||
}
|
||
: undefined,
|
||
comparisonStatus: outcome?.kind ?? 'not_attempted',
|
||
comparisonError: outcome?.kind === 'fetch_failed' ? outcome.error : undefined,
|
||
testCases: testCases.map((tc) => ({
|
||
name: tc.testCase.conversation[0].text.slice(0, 70),
|
||
buildSuccessCount: tc.buildSuccessCount,
|
||
totalRuns,
|
||
workflowChecksPerRun: tc.runs.map((run) =>
|
||
run.workflowChecks ? statusMap(run.workflowChecks) : null,
|
||
),
|
||
scenarios: tc.executionScenarios.map((sa) => ({
|
||
name: sa.scenario.name,
|
||
passCount: sa.passCount,
|
||
totalRuns,
|
||
passAtK: sa.passAtK[metrics.kIndex] ?? 0,
|
||
passHatK: sa.passHatK[metrics.kIndex] ?? 0,
|
||
runs: sa.runs.map((sr) => ({
|
||
passed: sr.success,
|
||
score: sr.score,
|
||
reasoning: sr.reasoning,
|
||
failureCategory: sr.failureCategory,
|
||
rootCause: sr.rootCause,
|
||
execErrors: sr.evalResult?.errors ?? [],
|
||
evalResult: sr.evalResult,
|
||
})),
|
||
})),
|
||
})),
|
||
};
|
||
|
||
const targetDir = outputDir ?? process.cwd();
|
||
mkdirSync(targetDir, { recursive: true });
|
||
const jsonPath = join(targetDir, 'eval-results.json');
|
||
writeFileSync(jsonPath, JSON.stringify(report, null, 2));
|
||
|
||
// Always write the rendered PR comment — the markdown formatter handles
|
||
// both with-comparison and no-baseline cases. CI consumes this file
|
||
// directly; local users get a copy-pasteable artifact.
|
||
const prCommentPath = join(targetDir, 'eval-pr-comment.md');
|
||
writeFileSync(
|
||
prCommentPath,
|
||
formatComparisonMarkdown(evaluation, outcome, { commitSha, slugByTestCase }),
|
||
);
|
||
|
||
return { jsonPath, prCommentPath };
|
||
}
|
||
|
||
/**
|
||
* Convert ComparisonResult into a JSON-serializable shape (Maps don't survive
|
||
* JSON.stringify by default).
|
||
*/
|
||
function serializeComparison(result: ComparisonResult): {
|
||
pr: { experimentName: string };
|
||
baseline: { experimentName: string };
|
||
aggregate: ComparisonResult['aggregate'];
|
||
scenarios: ComparisonResult['scenarios'];
|
||
prOnly: ComparisonResult['prOnly'];
|
||
baselineOnly: ComparisonResult['baselineOnly'];
|
||
failureCategories: ComparisonResult['failureCategories'];
|
||
} {
|
||
return {
|
||
pr: result.pr,
|
||
baseline: result.baseline,
|
||
aggregate: result.aggregate,
|
||
scenarios: result.scenarios,
|
||
prOnly: result.prOnly,
|
||
baselineOnly: result.baselineOnly,
|
||
failureCategories: result.failureCategories,
|
||
};
|
||
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Comparison vs the pinned baseline experiment
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/**
|
||
* Best-effort comparison. Returns a tagged outcome so the PR comment can
|
||
* distinguish "no baseline yet" / "this run IS the baseline" from a real
|
||
* regression-detection outage (LangSmith down, fetch failure). Never throws
|
||
* — the eval run is not gated on the comparison.
|
||
*/
|
||
async function tryRunComparison(config: {
|
||
lsClient: Client;
|
||
prExperimentName: string;
|
||
evaluation: MultiRunEvaluation;
|
||
testCasesWithFiles: WorkflowTestCaseWithFile[];
|
||
logger: EvalLogger;
|
||
}): Promise<ComparisonOutcome> {
|
||
const { lsClient, prExperimentName, evaluation, testCasesWithFiles, logger } = config;
|
||
|
||
try {
|
||
const baselineName = await findLatestBaseline(lsClient);
|
||
if (!baselineName) {
|
||
logger.verbose(
|
||
'No baseline experiment found — skipping comparison. ' +
|
||
'Run with --experiment-name instance-ai-baseline to create one.',
|
||
);
|
||
return { kind: 'no_baseline' };
|
||
}
|
||
if (baselineName === prExperimentName) {
|
||
logger.verbose('Current run is the baseline — skipping comparison.');
|
||
return { kind: 'self_baseline', experimentName: baselineName };
|
||
}
|
||
|
||
logger.info(`Comparing against baseline: ${baselineName}`);
|
||
const baseline = await fetchBaselineBucket(lsClient, baselineName);
|
||
const pr = bucketFromEvaluation(evaluation, testCasesWithFiles, prExperimentName);
|
||
return { kind: 'ok', result: compareBuckets(pr, baseline) };
|
||
} catch (error: unknown) {
|
||
const msg = error instanceof Error ? error.message : String(error);
|
||
logger.warn(`Comparison vs baseline failed: ${msg}`);
|
||
return { kind: 'fetch_failed', error: msg };
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Project the in-memory MultiRunEvaluation onto the bucket shape used by
|
||
* fetchBaselineBucket, keyed by `${fileSlug}/${scenarioName}`.
|
||
*
|
||
* Looks up `fileSlug` by test case reference rather than array index — the
|
||
* comparison key depends on getting the right slug, and zipping by index
|
||
* silently miscompares if anything ever reorders the aggregate.
|
||
*/
|
||
function bucketFromEvaluation(
|
||
evaluation: MultiRunEvaluation,
|
||
testCasesWithFiles: WorkflowTestCaseWithFile[],
|
||
experimentName: string,
|
||
): ExperimentBucket {
|
||
const slugByTestCase = new Map(
|
||
testCasesWithFiles.map(({ testCase, fileSlug }) => [testCase, fileSlug]),
|
||
);
|
||
const scenarios = new Map<string, ScenarioCounts>();
|
||
const failureCategoryTotals: Record<string, number> = {};
|
||
let trialTotal = 0;
|
||
for (const tc of evaluation.testCases) {
|
||
const fileSlug = slugByTestCase.get(tc.testCase);
|
||
if (!fileSlug) {
|
||
throw new Error(
|
||
`bucketFromEvaluation: no fileSlug for test case "${tc.testCase.conversation[0].text.slice(0, 60)}"`,
|
||
);
|
||
}
|
||
const total = tc.runs.length;
|
||
for (const sa of tc.executionScenarios) {
|
||
const key = `${fileSlug}/${sa.scenario.name}`;
|
||
const failureCategories: Record<string, number> = {};
|
||
for (const sr of sa.runs) {
|
||
trialTotal++;
|
||
if (!sr.success && sr.failureCategory) {
|
||
failureCategories[sr.failureCategory] = (failureCategories[sr.failureCategory] ?? 0) + 1;
|
||
failureCategoryTotals[sr.failureCategory] =
|
||
(failureCategoryTotals[sr.failureCategory] ?? 0) + 1;
|
||
}
|
||
}
|
||
scenarios.set(key, {
|
||
testCaseFile: fileSlug,
|
||
scenarioName: sa.scenario.name,
|
||
passed: sa.passCount,
|
||
total,
|
||
failureCategories,
|
||
});
|
||
}
|
||
}
|
||
return { experimentName, scenarios, failureCategoryTotals, trialTotal };
|
||
}
|
||
|
||
main().catch((error) => {
|
||
console.error('Fatal error:', error);
|
||
process.exit(1);
|
||
});
|