feat(ai-builder): Add LangSmith integration for workflow eval tracking (no-changelog) (#28835)

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
José Braulio González Valido 2026-04-23 09:47:02 +01:00 committed by GitHub
parent 639e1dab1c
commit 16e5f9572f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 1825 additions and 153 deletions

View File

@ -91,11 +91,17 @@ jobs:
run: >-
pnpm eval:instance-ai
--base-url http://localhost:5678
--concurrency 4
--verbose
--runs 5
--iterations 3
${{ inputs.filter && format('--filter "{0}"', inputs.filter) || '' }}
env:
N8N_INSTANCE_AI_MODEL_API_KEY: ${{ secrets.EVALS_ANTHROPIC_KEY }}
LANGSMITH_TRACING: 'true'
LANGSMITH_ENDPOINT: ${{ secrets.EVALS_LANGSMITH_ENDPOINT }}
LANGSMITH_API_KEY: ${{ secrets.EVALS_LANGSMITH_API_KEY }}
LANGSMITH_REVISION_ID: ${{ github.sha }}
LANGSMITH_BRANCH: ${{ github.head_ref || github.ref_name }}
- name: Stop n8n container
if: ${{ always() }}
@ -115,7 +121,7 @@ jobs:
# Build the full comment body with jq
jq -r '
"### Instance AI Workflow Eval Results\n\n" +
"**\(.summary.built)/\(.summary.testCases) built | \(.totalRuns) run(s) | pass@\(.totalRuns): \(.summary.passAtK * 100 | floor)% | pass^\(.totalRuns): \(.summary.passHatK * 100 | floor)%**\n\n" +
"**\(.summary.built)/\(.summary.testCases) built | \(.totalRuns) run(s) | pass@\(.totalRuns): \(.summary.passAtK * 100 | floor)% | pass^\(.totalRuns): \(.summary.passHatK * 100 | floor)% | iterations: \(.summary.passRatePerIter)**\n\n" +
"| Workflow | Build | pass@\(.totalRuns) | pass^\(.totalRuns) |\n|---|---|---|---|\n" +
([.testCases[] as $tc | "| \($tc.name) | \($tc.buildSuccessCount)/\($tc.totalRuns) | \(([$tc.scenarios[] | .passAtK] | add) / ($tc.scenarios | length) * 100 | floor)% | \(([$tc.scenarios[] | .passHatK] | add) / ($tc.scenarios | length) * 100 | floor)% |"] | join("\n")) +
"\n\n<details><summary>Failure details</summary>\n\n" +
@ -138,5 +144,7 @@ jobs:
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: instance-ai-workflow-eval-results
path: packages/@n8n/instance-ai/eval-results.json
path: |
packages/@n8n/instance-ai/eval-results.json
packages/@n8n/instance-ai/.data/workflow-eval-report.html
retention-days: 14

View File

@ -19,7 +19,32 @@ dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai --filter contact-for
dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai --filter contact-form --keep-workflows --verbose
```
Results are printed to the console and written to `eval-results.json`.
### Outputs
Every run produces three artifacts:
- **Console** — live progress, per-scenario pass/fail with `[failure_category]` tag, and a grouped summary at the end.
- **`eval-results.json`** — structured results in the current working directory. Consumed by the CI PR comment.
- **`.data/workflow-eval-report.html`** — rich debugging view with per-node execution traces, intercepted requests, mock responses, Phase 1 hints, and verifier reasoning. Self-contained HTML you can open in a browser.
If `LANGSMITH_API_KEY` is set, results are also sent to LangSmith as an experiment for historical comparison.
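For reference, a trimmed sketch of the JSON shape (mirroring `writeEvalResults()` in `evaluations/cli/index.ts`; fields abridged):
```ts
// Top-level summary block of eval-results.json:
interface EvalResultsSummary {
  testCases: number; // total test cases
  built: number; // test cases with at least one successful build
  scenariosTotal: number;
  passAtK: number; // mean pass@k across scenarios, 0..1
  passHatK: number; // mean pass^k across scenarios, 0..1
  passRatePerIter: string; // e.g. "37% / 37% / 37%"
}
```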
### CLI flags
| Flag | Default | Description |
|------|---------|-------------|
| `--verbose` | `false` | Log build/execute/verify timing and SSE events |
| `--filter` | — | Filter test cases by filename substring (e.g. `contact-form`) |
| `--keep-workflows` | `false` | Don't delete built workflows after the run |
| `--base-url` | `http://localhost:5678` | n8n instance URL |
| `--email` | E2E test owner | Override login email (also via `N8N_EVAL_EMAIL`) |
| `--password` | E2E test owner | Override login password (also via `N8N_EVAL_PASSWORD`) |
| `--timeout-ms` | `600000` | Per-test-case timeout |
| `--output-dir` | cwd | Where to write `eval-results.json` |
| `--dataset` | `instance-ai-workflow-evals` | LangSmith dataset name |
| `--concurrency` | `16` | Max concurrent scenarios (builds are separately capped at 4) |
| `--experiment-name` | auto | LangSmith experiment prefix (defaults to `ci-{branch}-{sha}` in CI or `local-{branch}-{sha}[-dirty]` locally) |
| `--iterations` | `1` | Run each test case N times with fresh builds — powers pass@k / pass^k metrics |
### Docker (without pnpm dev:ai)
@ -36,6 +61,11 @@ docker run -d --name n8n-eval \
-p 5678:5678 \
n8nio/n8n:local
# Seed the test user
curl -sf -X POST http://localhost:5678/rest/e2e/reset \
-H "Content-Type: application/json" \
-d '{"owner":{"email":"nathan@n8n.io","password":"PlaywrightTest123","firstName":"Eval","lastName":"Owner"},"admin":{"email":"admin@n8n.io","password":"PlaywrightTest123","firstName":"Admin","lastName":"User"},"members":[],"chat":{"email":"chat@n8n.io","password":"PlaywrightTest123","firstName":"Chat","lastName":"User"}}'
# Run evals against it
pnpm eval:instance-ai --base-url http://localhost:5678 --verbose
```
@ -44,17 +74,19 @@ pnpm eval:instance-ai --base-url http://localhost:5678 --verbose
Evals run automatically on PRs that change Instance AI code (path-filtered). The CI workflow starts a single Docker container and runs the CLI against it. See `.github/workflows/test-evals-instance-ai.yml`.
The eval job is **non-blocking**. Results are posted as a PR comment and uploaded as artifacts.
The eval job is **non-blocking**. Results are posted as a PR comment and uploaded as artifacts. When `LANGSMITH_API_KEY` is set (via the `EVALS_LANGSMITH_API_KEY` secret), the run also lands as an experiment in LangSmith with commit SHA + branch tagged.
### Environment variables
Set these in `.env.local`:
| Variable | Required | Description |
|----------|----------|-------------|
| `N8N_INSTANCE_AI_MODEL_API_KEY` | Yes | Anthropic API key for the Instance AI agent, mock generation, and verification |
| `N8N_EVAL_EMAIL` | No | n8n login email (defaults to E2E test owner) |
| `N8N_EVAL_PASSWORD` | No | n8n login password (defaults to E2E test owner) |
| `LANGSMITH_API_KEY` | No | Enables LangSmith experiment tracking + tracing. Without it, the CLI still runs and writes JSON/HTML. |
| `LANGSMITH_ENDPOINT` | No | LangSmith region endpoint (`https://api.smith.langchain.com` for US, `https://eu.api.smith.langchain.com` for EU) |
| `LANGSMITH_REVISION_ID` | No | Commit SHA to tag the experiment with (set automatically in CI) |
| `LANGSMITH_BRANCH` | No | Branch name to tag the experiment with (set automatically in CI) |
| `CONTEXT7_API_KEY` | No | Context7 API key for higher rate limits on API doc lookups. Free tier is 1,000 req/month |
## How it works
@ -68,11 +100,15 @@ Each test run:
### What gets mocked
- **Mocked nodes** — any node that makes HTTP requests (Gmail, Slack, Google Sheets, HTTP Request, etc.). The request is intercepted before it leaves the process. An LLM generates the response.
- **Pinned nodes** — trigger/start nodes get LLM-generated input data injected as pin data
- **Real nodes** — logic nodes (Code, Set, Merge, Filter, Sort, IF, Switch) execute their actual code on the mocked/pinned data
- **Mocked nodes** — any node that makes HTTP requests (Gmail, Slack, Google Sheets, HTTP Request, Notion, etc.). The request is intercepted before it leaves the process. An LLM generates the response.
- **Pinned nodes** — nodes that don't go through the HTTP layer: trigger/webhook nodes, LangChain/AI nodes (they use SDKs directly), database nodes. These receive LLM-generated data as pin data.
- **Real nodes** — logic nodes (Code, Set, Merge, Filter, IF, Switch) execute their actual code on the mocked/pinned data.
No real credentials or API connections are needed.
No real credentials or API connections are needed. ~95% of node types are covered; the main gaps are binary-data nodes (file attachments, image generation) and streaming nodes.
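How a node ends up in one of these buckets, as a conceptual sketch (the real routing lives in the mock-execution harness and is not part of this diff; names are illustrative):
```ts
// Illustrative only: routing a node to one of the three strategies.
type NodeStrategy = 'mock' | 'pin' | 'real';

function pickStrategy(node: { makesHttpRequests: boolean; usesOwnSdk: boolean }): NodeStrategy {
  if (node.makesHttpRequests) return 'mock'; // intercept; an LLM generates the response
  if (node.usesOwnSdk) return 'pin'; // LangChain/AI, DB, trigger nodes: inject pin data
  return 'real'; // Code, Set, Merge, Filter, IF, Switch execute normally
}
```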
## LangSmith integration
When `LANGSMITH_API_KEY` is set, each run is recorded as a LangSmith experiment against the `instance-ai-workflow-evals` dataset (synced from the JSON files before each run). Experiments against the same dataset can be compared side-by-side to spot regressions.
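In code terms, the wiring is roughly the following (condensed from `runWithLangSmith()` in `evaluations/cli/index.ts` in this PR; `target`, `feedbackExtractor`, `logger`, and `args` are defined there, and error handling is omitted):
```ts
import { Client } from 'langsmith';
import { evaluate } from 'langsmith/evaluation';

// Inside the CLI's async main path:
const lsClient = new Client(); // picks up LANGSMITH_API_KEY / LANGSMITH_ENDPOINT
const datasetName = await syncDataset(lsClient, args.dataset, logger, args.filter);
await evaluate(target, {
  data: lsClient.listExamples({ datasetName }),
  evaluators: [feedbackExtractor],
  experimentPrefix: args.experimentName ?? computeExperimentPrefix(),
  maxConcurrency: args.concurrency,
  client: lsClient,
});
```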
## Adding test cases
@ -128,13 +164,15 @@ When a scenario fails, the verifier categorizes the root cause:
```
evaluations/
├── index.ts # Public API
├── cli/ # CLI entry point and args parsing
├── cli/ # CLI entry point, arg parsing, CI metadata
├── clients/ # n8n REST + SSE clients
├── checklist/ # LLM verification with retry
├── credentials/ # Test credential seeding
├── data/workflows/ # Test case JSON files
├── harness/ # Runner: buildWorkflow, executeScenario, cleanupBuild
├── langsmith/ # Dataset sync + experiment setup
├── outcome/ # SSE event parsing, workflow discovery
├── report/ # HTML report generator
└── system-prompts/ # LLM prompts for verification
packages/cli/src/modules/instance-ai/eval/
@ -149,6 +187,8 @@ packages/cli/src/modules/instance-ai/eval/
## Known limitations
- **LangChain/AI nodes** — use their own SDKs, not intercepted by the HTTP mock layer. These nodes will fail with credential errors. Use pin data for these.
- **Binary / file nodes** — media attachments, image generation, file downloads. Mock metadata works but realistic binary content is out of scope.
- **Streaming nodes** — our mock returns complete responses, not streams.
- **GraphQL APIs** — response shape depends on the query, not just the endpoint. Quality depends on the LLM knowing the API schema.
- **Context7 quota** — free tier is 1,000 requests/month, 60/hour. A full suite run uses ~100 requests. When quota is exceeded, the LLM falls back to its training data.
- **Non-determinism** — the agent builds different workflows each run. Pass rates vary between 40% and 65%.

View File

@ -26,6 +26,7 @@ const checklistResultSchema = z.object({
// ---------------------------------------------------------------------------
const MAX_VERIFY_ATTEMPTS = 2;
const VERIFY_ATTEMPT_TIMEOUT_MS = 120_000;
export async function verifyChecklist(
checklist: ChecklistItem[],
@ -47,13 +48,28 @@ Verify each checklist item against the artifact above.`;
const validIds = new Set(llmItems.map((i) => i.id));
for (let attempt = 0; attempt < MAX_VERIFY_ATTEMPTS; attempt++) {
for (let attempt = 1; attempt <= MAX_VERIFY_ATTEMPTS; attempt++) {
const agent = createEvalAgent('eval-checklist-verifier', {
instructions: MOCK_EXECUTION_VERIFY_PROMPT,
cache: true,
}).structuredOutput(checklistResultSchema);
const result = await agent.generate(userMessage);
const abortController = new AbortController();
const timer = setTimeout(
() =>
abortController.abort(new Error(`verifier timed out after ${VERIFY_ATTEMPT_TIMEOUT_MS}ms`)),
VERIFY_ATTEMPT_TIMEOUT_MS,
);
let result;
try {
result = await agent.generate(userMessage, { abortSignal: abortController.signal });
} catch (error: unknown) {
const msg = error instanceof Error ? error.message : String(error);
console.warn(`[verifier] attempt ${attempt}/${MAX_VERIFY_ATTEMPTS} failed: ${msg}`);
continue;
} finally {
clearTimeout(timer);
}
const parsed = result.structuredOutput as z.infer<typeof checklistResultSchema> | undefined;
const results: ChecklistResult[] = [];
@ -82,7 +98,12 @@ Verify each checklist item against the artifact above.`;
results.sort((a, b) => a.id - b.id);
return results;
}
console.warn(
`[verifier] attempt ${attempt}/${MAX_VERIFY_ATTEMPTS} produced no parseable results`,
);
}
console.warn(`[verifier] exhausted ${MAX_VERIFY_ATTEMPTS} attempts, returning empty result`);
return [];
}

View File

@ -23,7 +23,7 @@ function combinations(n: number, k: number): number {
* Probability that at least 1 of k randomly chosen samples passes,
* given n total samples of which c passed.
*/
function passAtK(n: number, c: number, k: number): number {
export function passAtK(n: number, c: number, k: number): number {
if (k > n) return 0;
const denominator = combinations(n, k);
if (denominator === 0) return 0;
@ -35,7 +35,7 @@ function passAtK(n: number, c: number, k: number): number {
* Probability that all k independent attempts pass,
* given observed success rate p = c/n.
*/
function passHatK(n: number, c: number, k: number): number {
export function passHatK(n: number, c: number, k: number): number {
if (n === 0) return 0;
return Math.pow(c / n, k);
}
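// Worked example, assuming the standard unbiased estimator the doc comments
// above describe, i.e. pass@k = 1 - C(n-c, k) / C(n, k), with n=5 samples of
// which c=2 passed:
//
//   passAtK(5, 2, 1)  === 0.4     // one random sample passes 2/5 of the time
//   passAtK(5, 2, 5)  === 1       // drawing all 5 guarantees at least one pass
//   passHatK(5, 2, 1) === 0.4     // observed per-attempt success rate c/n
//   passHatK(5, 2, 5) === 0.01024 // all 5 independent attempts pass: (2/5)^5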

View File

@ -12,7 +12,7 @@ import { z } from 'zod';
// ---------------------------------------------------------------------------
export interface CliArgs {
/** TimeoutMs is defined per run, not as the total timeout for all the runs */
/** TimeoutMs is defined per iteration, not as the total timeout for all iterations */
timeoutMs: number;
baseUrl: string;
email?: string;
@ -24,8 +24,15 @@ export interface CliArgs {
keepWorkflows: boolean;
/** Directory to write eval-results.json (defaults to cwd) */
outputDir?: string;
/** Number of times to run each test case (default: 1) */
runs: number;
/** LangSmith dataset name (synced from JSON test cases before each run) */
dataset: string;
/** Max concurrent scenarios in evaluate(). Builds are separately limited to 4 by semaphore. */
concurrency: number;
/** LangSmith experiment name prefix (auto-generated if not set) */
experimentName?: string;
/** Number of iterations to run each test case (default: 1). Each iteration
* gets a fresh build so pass@k / pass^k capture real builder variance. */
iterations: number;
}
// ---------------------------------------------------------------------------
@ -41,7 +48,10 @@ const cliArgsSchema = z.object({
filter: z.string().optional(),
keepWorkflows: z.boolean().default(false),
outputDir: z.string().optional(),
runs: z.number().int().positive().default(1),
dataset: z.string().default('instance-ai-workflow-evals'),
concurrency: z.number().int().positive().default(16),
experimentName: z.string().optional(),
iterations: z.number().int().positive().default(1),
});
// ---------------------------------------------------------------------------
@ -61,7 +71,10 @@ export function parseCliArgs(argv: string[]): CliArgs {
filter: validated.filter,
keepWorkflows: validated.keepWorkflows,
outputDir: validated.outputDir,
runs: validated.runs,
dataset: validated.dataset,
concurrency: validated.concurrency,
experimentName: validated.experimentName,
iterations: validated.iterations,
};
}
@ -78,7 +91,10 @@ interface RawArgs {
filter?: string;
keepWorkflows: boolean;
outputDir?: string;
runs: number;
dataset: string;
concurrency: number;
experimentName?: string;
iterations: number;
}
function parseRawArgs(argv: string[]): RawArgs {
@ -88,7 +104,10 @@ function parseRawArgs(argv: string[]): RawArgs {
verbose: false,
keepWorkflows: false,
outputDir: undefined,
runs: 1,
dataset: 'instance-ai-workflow-evals',
concurrency: 16,
experimentName: undefined,
iterations: 1,
};
for (let i = 0; i < argv.length; i++) {
@ -130,16 +149,39 @@ function parseRawArgs(argv: string[]): RawArgs {
case '--output-dir':
result.outputDir = nextArg(argv, i, '--output-dir');
i++;
break;
case '--runs':
result.runs = parseIntArg(argv, i, '--runs');
case '--iterations':
result.iterations = parseIntArg(argv, i, '--iterations');
i++;
break;
case '--dataset':
result.dataset = nextArg(argv, i, '--dataset');
i++;
break;
case '--concurrency':
result.concurrency = parseIntArg(argv, i, '--concurrency');
i++;
break;
case '--experiment-name':
result.experimentName = nextArg(argv, i, '--experiment-name');
i++;
break;
default:
// Ignore unknown flags
break;
// Fail loudly on unknown flags. Strip any =value payload before
// echoing and drop positional values entirely — raw CLI input
// may contain secrets (e.g. --password=... or an accidentally
// pasted token) that would otherwise leak into terminal/CI logs.
if (arg.startsWith('--')) {
const flagName = arg.split('=', 1)[0];
throw new Error(`Unknown flag: ${flagName}`);
}
throw new Error('Unexpected positional argument');
}
}
@ -162,7 +204,8 @@ function parseIntArg(argv: string[], currentIndex: number, flagName: string): nu
const raw = nextArg(argv, currentIndex, flagName);
const parsed = parseInt(raw, 10);
if (Number.isNaN(parsed)) {
throw new Error(`Invalid integer for ${flagName}: ${raw}`);
// Don't echo raw — a bad shell expansion could leak a secret here.
throw new Error(`Invalid integer for ${flagName}`);
}
return parsed;
}
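// Illustrative failure paths of the stricter parser (the flag name below is
// made up; known flags are handled by the cases above):
//
//   parseRawArgs(['--iterations', '3'])  // ok: iterations = 3
//   parseRawArgs(['--token=secret123'])  // throws "Unknown flag: --token"
//                                        // (the =value payload is never echoed)
//   parseRawArgs(['secret123'])          // throws "Unexpected positional argument"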

View File

@ -0,0 +1,82 @@
/**
* CI Metadata for LangSmith experiments.
*
* Distinguishes CI runs from local development runs and tracks provenance
* of automated evaluation results.
*
* Note: git info (commit SHA, branch) is tracked by LangSmith automatically
* via the LANGSMITH_REVISION_ID and LANGSMITH_BRANCH env vars: set them in
* the CI workflow and the SDK picks them up.
*/
import { execSync } from 'node:child_process';
export interface CIMetadata {
source: 'ci' | 'local';
/** GitHub Actions event that triggered this run (e.g., 'pull_request', 'merge_group', 'workflow_dispatch') */
trigger?: string;
/** GitHub Actions run ID for linking back to the workflow run */
runId?: string;
}
export function buildCIMetadata(): CIMetadata {
const isCI = process.env.GITHUB_ACTIONS === 'true';
if (!isCI) {
return { source: 'local' };
}
return {
source: 'ci',
trigger: process.env.GITHUB_EVENT_NAME,
runId: process.env.GITHUB_RUN_ID,
};
}
/**
* Compute an informative experiment name prefix from branch and commit info.
* Falls back to a generic name if no git context is available.
*
* - CI: `ci-{branch}-{short-sha}` from GitHub Actions env vars
* - Local: `local-{branch}-{short-sha}[-dirty]` from git, dirty suffix if there are uncommitted changes
* - Fallback: `instance-ai-workflow-evals`
*
* LangSmith appends its own random suffix, so this doesn't need to be unique.
*/
export function computeExperimentPrefix(): string {
const ciName = computeCIExperimentName();
if (ciName) return ciName;
const localName = computeLocalExperimentName();
if (localName) return localName;
return 'instance-ai-workflow-evals';
}
function computeCIExperimentName(): string | undefined {
if (process.env.GITHUB_ACTIONS !== 'true') return undefined;
const branch = process.env.GITHUB_HEAD_REF ?? process.env.GITHUB_REF_NAME;
const sha = process.env.GITHUB_SHA;
if (!branch || !sha) return undefined;
return sanitize(`ci-${branch}-${sha.slice(0, 7)}`);
}
function computeLocalExperimentName(): string | undefined {
try {
const run = (cmd: string): string =>
execSync(cmd, { encoding: 'utf8', stdio: ['pipe', 'pipe', 'ignore'] }).trim();
const branch = run('git rev-parse --abbrev-ref HEAD');
const sha = run('git rev-parse --short HEAD');
const dirty = run('git status --porcelain').length > 0 ? '-dirty' : '';
return sanitize(`local-${branch}-${sha}${dirty}`);
} catch {
return undefined;
}
}
function sanitize(name: string): string {
return name.replace(/[^a-zA-Z0-9_.-]/g, '_').replace(/_{2,}/g, '_');
}
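// Illustrative prefixes (hypothetical branch/sha; sanitize() maps '/' to '_'):
//
//   CI, branch "feat/evals", sha "16e5f9572f..." -> "ci-feat_evals-16e5f95"
//   local checkout with uncommitted changes      -> "local-feat_evals-16e5f95-dirty"
//   no GitHub env vars and no git repository     -> "instance-ai-workflow-evals"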

View File

@ -1,34 +1,121 @@
#!/usr/bin/env node
import { mkdirSync, writeFileSync } from 'fs';
import { join } from 'path';
// ---------------------------------------------------------------------------
// Instance AI workflow eval CLI
//
// Runs workflow execution evaluations. When LANGSMITH_API_KEY is set, uses
// LangSmith's evaluate() for experiment tracking and tracing. Otherwise
// falls back to a direct loop with the same eval-results.json output.
// ---------------------------------------------------------------------------
import { aggregateResults } from './aggregator';
import type { InstanceAiEvalExecutionResult } from '@n8n/api-types';
import { mkdirSync, writeFileSync } from 'fs';
import { Client } from 'langsmith';
import { evaluate } from 'langsmith/evaluation';
import type { EvaluationResult } from 'langsmith/evaluation';
import type { Example, Run } from 'langsmith/schemas';
import { traceable } from 'langsmith/traceable';
import pLimit from 'p-limit';
import { join } from 'path';
import { z } from 'zod';
import { aggregateResults, passAtK, passHatK } from './aggregator';
import { parseCliArgs } from './args';
import { buildCIMetadata, computeExperimentPrefix } from './ci-metadata';
import { N8nClient } from '../clients/n8n-client';
import { seedCredentials, cleanupCredentials } from '../credentials/seeder';
import { loadWorkflowTestCases } from '../data/workflows';
import { loadWorkflowTestCasesWithFiles } from '../data/workflows';
import type { WorkflowTestCaseWithFile } from '../data/workflows';
import { createLogger } from '../harness/logger';
import { runWorkflowTestCase, runWithConcurrency } from '../harness/runner';
import type { EvalLogger } from '../harness/logger';
import {
buildWorkflow,
executeScenario,
cleanupBuild,
runWorkflowTestCase,
runWithConcurrency,
type BuildResult,
} from '../harness/runner';
import { syncDataset, type DatasetExampleInputs } from '../langsmith/dataset-sync';
import { snapshotWorkflowIds } from '../outcome/workflow-discovery';
import type { MultiRunEvaluation, WorkflowTestCaseResult } from '../types';
import { writeWorkflowReport } from '../report/workflow-report';
import type {
MultiRunEvaluation,
ScenarioResult,
TestScenario,
WorkflowTestCaseResult,
} from '../types';
// n8n degrades above ~4 concurrent builds.
const MAX_CONCURRENT_BUILDS = 4;
const targetOutputSchema = z.object({
buildSuccess: z.boolean().default(false),
passed: z.boolean().default(false),
score: z.number().default(0),
reasoning: z.string().default(''),
workflowId: z.string().optional(),
failureCategory: z.string().optional(),
rootCause: z.string().optional(),
execErrors: z.array(z.string()).default([]),
evalResult: z.unknown().optional(),
/** Only set on the scenario that initiated the build. */
buildDurationMs: z.number().optional(),
execDurationMs: z.number().default(0),
nodeCount: z.number().default(0),
});
type TargetOutput = Omit<z.infer<typeof targetOutputSchema>, 'evalResult'> & {
evalResult?: InstanceAiEvalExecutionResult;
};
function isPlainObject(v: unknown): v is Record<string, unknown> {
return typeof v === 'object' && v !== null && !Array.isArray(v);
}
function isEvalResult(v: unknown): v is InstanceAiEvalExecutionResult {
if (!isPlainObject(v)) return false;
return (
typeof v.nodeResults === 'object' &&
v.nodeResults !== null &&
Array.isArray(v.errors) &&
typeof v.hints === 'object' &&
v.hints !== null
);
}
/** Safe-parse a run's outputs. Returns `undefined` if the row is malformed
* or missing, so callers can skip it instead of treating it as a genuine
* failed evaluation. Every field in the schema has a default, so an empty
* or nullish raw value would otherwise parse successfully into a "failed"
* shape (passed:false, score:0), masking infra errors as builder regressions.
*/
function parseTargetOutput(raw: unknown): TargetOutput | undefined {
if (!isPlainObject(raw) || Object.keys(raw).length === 0) return undefined;
const parsed = targetOutputSchema.safeParse(raw);
if (!parsed.success) return undefined;
return {
...parsed.data,
evalResult: isEvalResult(parsed.data.evalResult) ? parsed.data.evalResult : undefined,
};
}
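// Why the emptiness check matters (illustrative values):
//
//   parseTargetOutput(undefined) // -> undefined: missing row, caller skips it
//   parseTargetOutput({})        // -> undefined: defaults would fake a failure
//   parseTargetOutput({ passed: false, score: 0, reasoning: 'no email sent' })
//                                // -> a genuine failure, remaining defaults filled in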
const runInputsSchema = z
.object({
prompt: z.string().default(''),
testCaseFile: z.string().default(''),
scenarioName: z.string().default(''),
/** 0-based iteration index; injected during multi-run expansion. */
_iteration: z.number().int().nonnegative().default(0),
})
.passthrough();
/** Target input shape with the iteration index we inject for multi-run. */
type TargetInputs = DatasetExampleInputs & { _iteration?: number };
async function main(): Promise<void> {
const args = parseCliArgs(process.argv.slice(2));
const testCases = loadWorkflowTestCases(args.filter);
if (testCases.length === 0) {
console.log('No workflow test cases found in evaluations/data/workflows/');
return;
}
const totalScenarios = testCases.reduce((sum, tc) => sum + tc.scenarios.length, 0);
console.log(
`Running ${String(testCases.length)} workflow test case(s) with ${String(totalScenarios)} scenario(s) x ${String(args.runs)} runs\n`,
);
const logger = createLogger(args.verbose);
// Setup: authenticate, seed credentials, snapshot workflows
const client = new N8nClient(args.baseUrl);
logger.info(`Authenticating with ${args.baseUrl}...`);
await client.login(args.email, args.password);
@ -38,73 +125,640 @@ async function main(): Promise<void> {
const seedResult = await seedCredentials(client, undefined, logger);
logger.info(`Seeded ${String(seedResult.credentialIds.length)} credential(s)`);
// Run test cases with bounded concurrency.
// Each test case builds a workflow (uses n8n's agent) then runs scenarios
// (uses our Anthropic key for Phase 1 + Phase 2 mock generation).
const MAX_CONCURRENT_TEST_CASES = 4;
const preRunWorkflowIds = await snapshotWorkflowIds(client);
const claimedWorkflowIds = new Set<string>();
const startTime = Date.now();
const allRunResults: WorkflowTestCaseResult[][] = [];
try {
for (let run = 0; run < args.runs; run++) {
if (args.runs > 1) {
console.log(`\n--- Run #${String(run + 1)}/${String(args.runs)} ---\n`);
}
const hasLangSmith = Boolean(process.env.LANGSMITH_API_KEY);
const preRunWorkflowIds = await snapshotWorkflowIds(client);
const claimedWorkflowIds = new Set<string>();
let evaluation: MultiRunEvaluation;
const results = await runWithConcurrency(
testCases,
async (testCase) =>
await runWorkflowTestCase({
client,
testCase,
timeoutMs: args.timeoutMs,
seededCredentialTypes: seedResult.seededTypes,
preRunWorkflowIds,
claimedWorkflowIds,
logger,
keepWorkflows: args.keepWorkflows,
}),
MAX_CONCURRENT_TEST_CASES,
);
allRunResults.push(results);
if (hasLangSmith) {
logger.info('LangSmith API key detected, using evaluate() with experiment tracking');
evaluation = await runWithLangSmith({
args,
client,
preRunWorkflowIds,
claimedWorkflowIds,
logger,
seedResult,
});
} else {
logger.info('No LANGSMITH_API_KEY, running direct loop (results in eval-results.json only)');
evaluation = await runDirectLoop({
args,
client,
preRunWorkflowIds,
claimedWorkflowIds,
logger,
seedResult,
});
}
const totalDuration = Date.now() - startTime;
const outputPath = writeEvalResults(evaluation, totalDuration, args.outputDir);
console.log(`Results: ${outputPath}`);
const htmlPath = writeWorkflowReport(flattenRunsForReport(evaluation));
console.log(`Report: ${htmlPath}`);
printSummary(evaluation);
} finally {
await cleanupCredentials(client, seedResult.credentialIds).catch(() => {});
}
const totalDuration = Date.now() - startTime;
const aggregatedResults = aggregateResults(allRunResults, args.runs);
// Write eval-results.json for CI consumption (PR comments, artifacts)
const outputPath = writeEvalResults(aggregatedResults, totalDuration, args.outputDir);
console.log(`Results: ${outputPath}`);
// Print console summary
printSummary(aggregatedResults);
}
/** Write structured JSON results for CI (PR comments, artifact upload). */
// ---------------------------------------------------------------------------
// LangSmith mode: evaluate() with dataset sync, tracing, experiments
// ---------------------------------------------------------------------------
interface RunConfig {
args: ReturnType<typeof parseCliArgs>;
client: N8nClient;
preRunWorkflowIds: Set<string>;
claimedWorkflowIds: Set<string>;
logger: EvalLogger;
seedResult: { seededTypes: string[]; credentialIds: string[] };
}
async function runWithLangSmith(config: RunConfig): Promise<MultiRunEvaluation> {
const { args, client, preRunWorkflowIds, claimedWorkflowIds, logger } = config;
const lsClient = new Client();
const datasetName = await syncDataset(lsClient, args.dataset, logger, args.filter);
const testCasesWithFiles = loadWorkflowTestCasesWithFiles(args.filter);
const buildLimiter = pLimit(MAX_CONCURRENT_BUILDS);
// Keyed by `${iteration}:${prompt}` so the same prompt gets a fresh build
// per iteration — pass@k captures real builder variance.
const buildCache = new Map<string, Promise<BuildResult>>();
const buildDurations = new Map<string, number>();
// Traceable wraps the actual build call *inside* the limiter — otherwise the
// LangSmith span would include queue-wait time, which accumulates across
// iterations as later builds queue behind earlier ones.
const tracedBuildWorkflow = traceable(
async (prompt: string) =>
await buildWorkflow({
client,
prompt,
timeoutMs: args.timeoutMs,
preRunWorkflowIds,
claimedWorkflowIds,
logger,
}),
{ name: 'workflow_build', run_type: 'chain', client: lsClient },
);
async function getOrBuild(
prompt: string,
iteration: number,
): Promise<{ build: BuildResult; buildDurationMs?: number }> {
const key = `${String(iteration)}:${prompt}`;
const existing = buildCache.get(key);
if (existing) return { build: await existing };
const promise = buildLimiter(async () => {
const start = Date.now();
const build = await tracedBuildWorkflow(prompt);
buildDurations.set(key, Date.now() - start);
return build;
});
buildCache.set(key, promise);
const build = await promise;
return { build, buildDurationMs: buildDurations.get(key) };
}
const traceableExecute = traceable(
async (execArgs: {
workflowId: string;
scenario: TestScenario;
workflowJsons: BuildResult['workflowJsons'];
}) =>
await executeScenario(
client,
execArgs.workflowId,
execArgs.scenario,
execArgs.workflowJsons,
logger,
),
{ name: 'scenario_execution', run_type: 'chain', client: lsClient },
);
const target = async (inputs: TargetInputs): Promise<TargetOutput> => {
const iteration = inputs._iteration ?? 0;
const scenario: TestScenario = {
name: inputs.scenarioName,
description: inputs.scenarioDescription,
dataSetup: inputs.dataSetup,
successCriteria: inputs.successCriteria,
};
const { build, buildDurationMs } = await getOrBuild(inputs.prompt, iteration);
if (!build.success || !build.workflowId) {
return {
buildSuccess: false,
passed: false,
score: 0,
reasoning: `Build failed: ${build.error ?? 'unknown'}`,
failureCategory: 'build_failure',
execErrors: build.error ? [build.error] : [],
buildDurationMs,
execDurationMs: 0,
nodeCount: 0,
};
}
const execStart = Date.now();
const nodeCount = build.workflowJsons[0]?.nodes.length ?? 0;
let result;
try {
result = await traceableExecute({
workflowId: build.workflowId,
scenario,
workflowJsons: build.workflowJsons,
});
} catch (error: unknown) {
// Mirror direct mode's per-scenario guard — without this, n8n API errors
// or verifier timeouts from executeWithLlmMock / verifyChecklist would
// escape to LangSmith, come back as a Run with null outputs, and be
// misclassified as builder regressions by the feedback extractor.
const errorMessage = error instanceof Error ? error.message : String(error);
logger.error(` ERROR [${scenario.name}]: ${errorMessage}`);
return {
buildSuccess: true,
workflowId: build.workflowId,
passed: false,
score: 0,
reasoning: `Scenario execution error: ${errorMessage}`,
failureCategory: 'framework_issue',
execErrors: [errorMessage],
buildDurationMs,
execDurationMs: Date.now() - execStart,
nodeCount,
};
}
const execDurationMs = Date.now() - execStart;
// Strip failure fields on pass: the verifier sometimes returns "."
// placeholders instead of omitting them.
const failureCategory = result.success ? undefined : result.failureCategory;
const rootCause = result.success ? undefined : result.rootCause;
return {
buildSuccess: true,
workflowId: build.workflowId,
passed: result.success,
score: result.score,
reasoning: result.reasoning,
failureCategory,
rootCause,
execErrors: result.evalResult?.errors ?? [],
evalResult: result.evalResult,
buildDurationMs,
execDurationMs,
nodeCount,
};
};
const feedbackExtractor = ({ run }: { run: Run }): EvaluationResult[] => {
const output = parseTargetOutput(run.outputs);
if (!output) return [];
// 'none' for passed scenarios so the column shows a full categorical
// breakdown instead of blank cells.
const failureCategory = output.passed ? 'none' : (output.failureCategory ?? 'unknown');
const feedback: EvaluationResult[] = [
{
key: 'scenario_pass',
score: output.score,
comment: output.reasoning || undefined,
},
{
key: 'failure_category',
value: failureCategory,
},
{
key: 'exec_duration_s',
score: output.execDurationMs / 1000,
},
{
key: 'node_count',
score: output.nodeCount,
},
];
if (output.buildDurationMs !== undefined) {
feedback.push({ key: 'build_duration_s', score: output.buildDurationMs / 1000 });
}
return feedback;
};
const experimentPrefix = args.experimentName ?? computeExperimentPrefix();
logger.info(
`Starting evaluate() with concurrency=${String(args.concurrency)}, builds limited to ${String(MAX_CONCURRENT_BUILDS)}, iterations=${String(args.iterations)}`,
);
const sourceExamples = args.filter
? filteredExamplesIterable(lsClient, datasetName, args.filter, logger)
: lsClient.listExamples({ datasetName });
const evaluateData =
args.iterations > 1
? expandExamplesForIterations(sourceExamples, args.iterations)
: sourceExamples;
try {
const evaluateStart = Date.now();
const experimentResults = await evaluate(target, {
data: evaluateData,
evaluators: [feedbackExtractor],
experimentPrefix,
maxConcurrency: args.concurrency,
client: lsClient,
metadata: {
filter: args.filter ?? 'all',
concurrency: args.concurrency,
maxBuilds: MAX_CONCURRENT_BUILDS,
iterations: args.iterations,
...buildCIMetadata(),
},
});
const totalDurationMs = Date.now() - evaluateStart;
logger.info(`Experiment: ${experimentResults.experimentName}`);
await lsClient.awaitPendingTraceBatches();
const allRunResults = reshapeLangSmithRuns(
experimentResults.results,
testCasesWithFiles,
args.iterations,
);
const evaluation = aggregateResults(allRunResults, args.iterations);
await updateExperimentAggregates({
lsClient,
experimentName: experimentResults.experimentName,
runs: experimentResults.results,
evaluation,
buildDurations,
totalDurationMs,
logger,
});
await writePerRunPassMetrics({
lsClient,
runs: experimentResults.results,
logger,
});
return evaluation;
} finally {
if (!args.keepWorkflows) {
await Promise.all(
[...buildCache.values()].map(async (buildPromise) => {
try {
const build = await buildPromise;
await cleanupBuild(client, build, logger);
} catch {
// Best-effort
}
}),
);
}
}
}
/**
* Expand a source example stream into N copies, tagging each with `_iteration`
* so the target function can key its build cache by iteration and we can
* reshape runs back into per-iteration groups afterwards. All N copies share
* the source example's id, so LangSmith's UI groups them naturally by
* `reference_example_id`, which is useful for pass@k visualization.
*
* The source is buffered into memory once before the first yield: we need to
* emit each example N times, and an AsyncIterable can only be consumed once.
*/
async function* expandExamplesForIterations(
source: AsyncIterable<Example>,
iterations: number,
): AsyncIterable<Example> {
const cached: Example[] = [];
for await (const ex of source) cached.push(ex);
for (let i = 0; i < iterations; i++) {
for (const ex of cached) {
yield { ...ex, inputs: { ...ex.inputs, _iteration: i } };
}
}
}
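// Illustration: two examples A and B with iterations=3 yield six rows in the
// order A(_iteration:0), B(0), A(1), B(1), A(2), B(2), each keeping its
// source example id.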
function filteredExamplesIterable(
lsClient: Client,
datasetName: string,
filter: string,
logger: EvalLogger,
): AsyncIterable<Example> {
const slugs = loadWorkflowTestCasesWithFiles(filter).map((tc) => tc.fileSlug);
if (slugs.length === 0) {
logger.info(`Filter "${filter}" matched no local test case files`);
return (async function* () {})();
}
logger.info(`Filter "${filter}" matched ${String(slugs.length)} split(s): ${slugs.join(', ')}`);
return lsClient.listExamples({ datasetName, splits: slugs });
}
async function updateExperimentAggregates(config: {
lsClient: Client;
experimentName: string;
runs: Array<{ run: Run }>;
evaluation: MultiRunEvaluation;
buildDurations: Map<string, number>;
totalDurationMs: number;
logger: EvalLogger;
}): Promise<void> {
const { lsClient, experimentName, runs, evaluation, buildDurations, totalDurationMs, logger } =
config;
const buildTimes = [...buildDurations.values()];
const uniqueBuilds = buildTimes.length;
const avgBuildMs =
uniqueBuilds > 0 ? buildTimes.reduce((sum, d) => sum + d, 0) / uniqueBuilds : 0;
const execTimes = runs
.map(({ run }) => parseTargetOutput(run.outputs)?.execDurationMs)
.filter((ms): ms is number => typeof ms === 'number');
const avgExecMs =
execTimes.length > 0 ? execTimes.reduce((sum, d) => sum + d, 0) / execTimes.length : 0;
const aggregates = {
duration_s: Math.round(totalDurationMs / 100) / 10,
avg_build_s: Math.round(avgBuildMs / 100) / 10,
avg_exec_s: Math.round(avgExecMs / 100) / 10,
unique_builds: uniqueBuilds,
pass_rate_per_iter: computePassRatePerIter(evaluation),
};
try {
const project = await lsClient.readProject({ projectName: experimentName });
// `updateProject` replaces `extra` wholesale — preserve it so auto-set
// fields (splits, etc.) survive. Narrow via typeof guards rather than `as`.
const existingExtra = isPlainObject(project.extra) ? project.extra : {};
const existingMetadata = isPlainObject(existingExtra.metadata) ? existingExtra.metadata : {};
await lsClient.updateProject(project.id, {
projectExtra: existingExtra,
metadata: { ...existingMetadata, ...aggregates },
});
logger.verbose(`Updated experiment metadata: ${JSON.stringify(aggregates)}`);
} catch (error: unknown) {
const msg = error instanceof Error ? error.message : String(error);
logger.verbose(`Could not update experiment metadata: ${msg}`);
}
}
/**
* Attach per-example pass metrics (pass_rate, pass_at_k, pass_hat_k) as
* feedback on every run in the example's group. All N runs of the same example
* carry the same value, which lets the LangSmith UI sort/filter individual
* runs by their example's metric, and its per-experiment column aggregation
* reduces to the mean across unique examples.
*/
async function writePerRunPassMetrics(config: {
lsClient: Client;
runs: Array<{ run: Run }>;
logger: EvalLogger;
}): Promise<void> {
const { lsClient, runs, logger } = config;
// Group runs by reference_example_id, counting passes.
const byExample = new Map<string, { runIds: string[]; passed: number; total: number }>();
for (const { run } of runs) {
const exampleId = run.reference_example_id;
if (!exampleId) continue;
const output = parseTargetOutput(run.outputs);
if (!output) continue;
const entry = byExample.get(exampleId) ?? { runIds: [], passed: 0, total: 0 };
entry.runIds.push(run.id);
entry.total++;
if (output.passed) entry.passed++;
byExample.set(exampleId, entry);
}
// Individual writes are best-effort: a transient API error on one run
// shouldn't block the rest, so we swallow per-promise and keep going.
const feedbackWrites: Array<Promise<unknown>> = [];
for (const { runIds, passed, total } of byExample.values()) {
const passAtKValue = passAtK(total, passed, total);
const passHatKValue = passHatK(total, passed, total);
for (const runId of runIds) {
feedbackWrites.push(
lsClient.createFeedback(runId, 'pass_at_k', { score: passAtKValue }).catch(() => {}),
lsClient.createFeedback(runId, 'pass_hat_k', { score: passHatKValue }).catch(() => {}),
);
}
}
await Promise.all(feedbackWrites);
logger.verbose(
`Wrote pass metrics feedback for ${String(byExample.size)} example(s) across ${String(runs.length)} run(s)`,
);
}
/**
* Convert LangSmith's flat `Run[]` into the `WorkflowTestCaseResult[][]` shape
* the aggregator expects (outer: runs, inner: test cases). Groups by
* (testCaseFile, scenarioName), then reconstructs per-iteration test case
* results. Scenarios with no matching run get a build_failure stub.
*/
function reshapeLangSmithRuns(
rows: Array<{ run: Run }>,
testCasesWithFiles: WorkflowTestCaseWithFile[],
numIterations: number,
): WorkflowTestCaseResult[][] {
// Index runs by (iteration, testCaseFile, scenarioName) using the `_iteration`
// we injected in expandExamplesForIterations. Falls back to 0 for single-run.
const byKey = new Map<string, Run>();
for (const { run } of rows) {
const inputs = runInputsSchema.safeParse(run.inputs ?? {});
if (!inputs.success) continue;
const key = `${String(inputs.data._iteration)}/${inputs.data.testCaseFile}/${inputs.data.scenarioName}`;
byKey.set(key, run);
}
const allRunResults: WorkflowTestCaseResult[][] = [];
for (let iter = 0; iter < numIterations; iter++) {
const runResults: WorkflowTestCaseResult[] = [];
for (const { testCase, fileSlug } of testCasesWithFiles) {
const scenarioResults: ScenarioResult[] = [];
let workflowBuildSuccess = false;
let workflowId: string | undefined;
let buildError: string | undefined;
for (const scenario of testCase.scenarios) {
const run = byKey.get(`${String(iter)}/${fileSlug}/${scenario.name}`);
const output = run ? parseTargetOutput(run.outputs) : undefined;
if (!run || !output) {
scenarioResults.push({
scenario,
success: false,
score: 0,
reasoning: run ? 'Malformed run output — skipped' : 'No run result for this scenario',
});
continue;
}
if (output.buildSuccess) workflowBuildSuccess = true;
if (output.workflowId) workflowId = output.workflowId;
if (!output.buildSuccess && output.reasoning) buildError = output.reasoning;
scenarioResults.push({
scenario,
success: output.passed,
evalResult: output.evalResult,
score: output.score,
reasoning: output.reasoning,
failureCategory: output.failureCategory,
rootCause: output.rootCause,
});
}
runResults.push({
testCase,
workflowBuildSuccess,
workflowId,
scenarioResults,
buildError,
});
}
allRunResults.push(runResults);
}
return allRunResults;
}
// ---------------------------------------------------------------------------
// Direct mode: simple loop, no LangSmith dependency
// ---------------------------------------------------------------------------
async function runDirectLoop(config: RunConfig): Promise<MultiRunEvaluation> {
const { args, client, preRunWorkflowIds, claimedWorkflowIds, logger, seedResult } = config;
const testCasesWithFiles = loadWorkflowTestCasesWithFiles(args.filter);
if (testCasesWithFiles.length === 0) {
console.log('No workflow test cases found in evaluations/data/workflows/');
return { totalRuns: 0, testCases: [] };
}
const totalScenarios = testCasesWithFiles.reduce(
(sum, { testCase }) => sum + testCase.scenarios.length,
0,
);
logger.info(
`Running ${String(testCasesWithFiles.length)} test case(s) with ${String(totalScenarios)} scenario(s) × ${String(args.iterations)} iteration(s)`,
);
const allRunResults: WorkflowTestCaseResult[][] = [];
for (let iter = 0; iter < args.iterations; iter++) {
if (args.iterations > 1) {
logger.info(`--- Iteration #${String(iter + 1)}/${String(args.iterations)} ---`);
}
const results = await runWithConcurrency(
testCasesWithFiles,
async ({ testCase }) =>
await runWorkflowTestCase({
client,
testCase,
timeoutMs: args.timeoutMs,
seededCredentialTypes: seedResult.seededTypes,
preRunWorkflowIds,
claimedWorkflowIds,
logger,
keepWorkflows: args.keepWorkflows,
}),
MAX_CONCURRENT_BUILDS,
);
allRunResults.push(results);
}
return aggregateResults(allRunResults, args.iterations);
}
// ---------------------------------------------------------------------------
// eval-results.json output (same shape as CI PR comment expects)
// ---------------------------------------------------------------------------
/**
* Flatten per-iteration runs into a single list of test-case results for the
* HTML report. Previously we rendered only `tc.runs[0]`, which silently hid
* iterations 2..N: a flaky scenario that passed once and failed twice would
* appear clean in the uploaded artifact. For multi-iteration runs we prefix
* each prompt with its iteration number so the cards are distinguishable at
* a glance.
*/
function flattenRunsForReport(evaluation: MultiRunEvaluation): WorkflowTestCaseResult[] {
if (evaluation.totalRuns <= 1) {
return evaluation.testCases.map((tc) => tc.runs[0]);
}
return evaluation.testCases.flatMap((tc) =>
tc.runs.map((run, iter) => ({
...run,
testCase: {
...run.testCase,
prompt: `[iter ${String(iter + 1)}/${String(evaluation.totalRuns)}] ${run.testCase.prompt}`,
},
})),
);
}
interface AggregateMetrics {
/** Number of test cases with at least one successful build across iterations. */
built: number;
/** Total scenarios across all test cases. */
scenariosTotal: number;
/** Mean pass@k across scenarios at k = totalRuns (0..1). */
passAtK: number;
/** Mean pass^k across scenarios at k = totalRuns (0..1). */
passHatK: number;
/** Index into each scenario's passAtK/passHatK array for k = totalRuns. */
kIndex: number;
/** Pass rate of each iteration formatted as e.g. "37% / 37% / 37%". */
passRatePerIter: string;
}
function computeAggregateMetrics(evaluation: MultiRunEvaluation): AggregateMetrics {
const { totalRuns, testCases } = evaluation;
const allScenarios = testCases.flatMap((tc) => tc.scenarios);
const total = allScenarios.length;
const kIndex = Math.max(totalRuns - 1, 0);
const built = testCases.filter((tc) => tc.buildSuccessCount > 0).length;
const passAtK =
total > 0 ? allScenarios.reduce((sum, s) => sum + (s.passAtK[kIndex] ?? 0), 0) / total : 0;
const passHatK =
total > 0 ? allScenarios.reduce((sum, s) => sum + (s.passHatK[kIndex] ?? 0), 0) / total : 0;
return {
built,
scenariosTotal: total,
passAtK,
passHatK,
kIndex,
passRatePerIter: computePassRatePerIter(evaluation),
};
}
/** Pass rate of each iteration formatted as e.g. "37% / 37% / 37%". */
function computePassRatePerIter(evaluation: MultiRunEvaluation): string {
const { totalRuns, testCases } = evaluation;
const allScenarios = testCases.flatMap((tc) => tc.scenarios);
if (allScenarios.length === 0) return '';
const rates: string[] = [];
for (let i = 0; i < totalRuns; i++) {
const passed = allScenarios.filter((s) => s.runs[i]?.success).length;
rates.push(`${String(Math.round((passed / allScenarios.length) * 100))}%`);
}
return rates.join(' / ');
}
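// Example: 3 iterations over 8 scenarios, with 3, 4, and 3 scenarios passing
// per iteration, formats as "38% / 50% / 38%".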
function writeEvalResults(
evaluation: MultiRunEvaluation,
duration: number,
outputDir?: string,
): string {
const { totalRuns, testCases } = evaluation;
const allScenarios = testCases.flatMap((tc) => tc.scenarios);
const totalScenariosCount = allScenarios.length;
const passAtKCount =
totalScenariosCount > 0
? allScenarios.reduce((sum, s) => sum + (s.passAtK[totalRuns - 1] ?? 0), 0)
: 0;
const passHatKCount =
totalScenariosCount > 0
? allScenarios.reduce((sum, s) => sum + (s.passHatK[totalRuns - 1] ?? 0), 0)
: 0;
const metrics = computeAggregateMetrics(evaluation);
const report = {
timestamp: new Date().toISOString(),
@ -112,10 +766,11 @@ function writeEvalResults(
totalRuns,
summary: {
testCases: testCases.length,
built: testCases.filter((tc) => tc.buildSuccessCount > 0).length,
scenariosTotal: totalScenariosCount,
passAtK: totalScenariosCount > 0 ? passAtKCount / totalScenariosCount : 0,
passHatK: totalScenariosCount > 0 ? passHatKCount / totalScenariosCount : 0,
built: metrics.built,
scenariosTotal: metrics.scenariosTotal,
passAtK: metrics.passAtK,
passHatK: metrics.passHatK,
passRatePerIter: metrics.passRatePerIter,
},
testCases: testCases.map((tc) => ({
name: tc.testCase.prompt.slice(0, 70),
@ -125,29 +780,36 @@ function writeEvalResults(
name: sa.scenario.name,
passCount: sa.passCount,
totalRuns,
passAtK: sa.passAtK[totalRuns - 1] ?? 0,
passHatK: sa.passHatK[totalRuns - 1] ?? 0,
passAtK: sa.passAtK[metrics.kIndex] ?? 0,
passHatK: sa.passHatK[metrics.kIndex] ?? 0,
runs: sa.runs.map((sr) => ({
passed: sr.success,
score: sr.score,
reasoning: sr.reasoning,
failureCategory: sr.failureCategory,
rootCause: sr.rootCause,
execErrors: sr.evalResult?.errors ?? [],
evalResult: sr.evalResult,
})),
})),
})),
};
const dir = outputDir ?? process.cwd();
mkdirSync(dir, { recursive: true });
const outputPath = join(dir, 'eval-results.json');
const targetDir = outputDir ?? process.cwd();
mkdirSync(targetDir, { recursive: true });
const outputPath = join(targetDir, 'eval-results.json');
writeFileSync(outputPath, JSON.stringify(report, null, 2));
return outputPath;
}
// ---------------------------------------------------------------------------
// Console summary
// ---------------------------------------------------------------------------
function printSummary(evaluation: MultiRunEvaluation): void {
const { totalRuns, testCases } = evaluation;
const multiRun = totalRuns > 1;
const metrics = computeAggregateMetrics(evaluation);
console.log('\n=== Workflow Eval Results ===\n');
for (const tc of testCases) {
@ -166,8 +828,8 @@ function printSummary(evaluation: MultiRunEvaluation): void {
for (const sa of tc.scenarios) {
if (multiRun) {
const passAtK = Math.round((sa.passAtK[totalRuns - 1] ?? 0) * 100);
const passHatK = Math.round((sa.passHatK[totalRuns - 1] ?? 0) * 100);
const passAtK = Math.round((sa.passAtK[metrics.kIndex] ?? 0) * 100);
const passHatK = Math.round((sa.passHatK[metrics.kIndex] ?? 0) * 100);
console.log(
` ${sa.scenario.name}: ${String(sa.passCount)}/${String(totalRuns)} passed` +
` | pass@${String(totalRuns)}: ${String(passAtK)}% | pass^${String(totalRuns)}: ${String(passHatK)}%`,
@ -175,52 +837,34 @@ function printSummary(evaluation: MultiRunEvaluation): void {
} else {
const sr = sa.runs[0];
const icon = sr.success ? '✓' : '✗';
const category = sr.failureCategory ? ` [${sr.failureCategory}]` : '';
console.log(
` ${icon} ${sr.scenario.name}: ${sr.success ? 'PASS' : 'FAIL'} (${String(sr.score * 100)}%)`,
` ${icon} ${sr.scenario.name}: ${sr.success ? 'PASS' : 'FAIL'}${category} (${String(sr.score * 100)}%)`,
);
if (!sr.success) {
console.log(` ${sr.reasoning.slice(0, 120)}`);
const execErrors = sr.evalResult?.errors ?? [];
if (execErrors.length > 0) {
console.log(` Error: ${execErrors.join('; ').slice(0, 200)}`);
}
console.log(` Diagnosis: ${sr.reasoning.slice(0, 200)}`);
}
}
}
console.log('');
}
// Aggregate metrics for multi-run
if (multiRun) {
console.log(
`${String(metrics.built)}/${String(testCases.length)} built | pass@${String(totalRuns)}: ${String(Math.round(metrics.passAtK * 100))}% | pass^${String(totalRuns)}: ${String(Math.round(metrics.passHatK * 100))}% | iterations: ${metrics.passRatePerIter}`,
);
} else {
const allScenarios = testCases.flatMap((tc) => tc.scenarios);
const total = allScenarios.length;
const avgPassAtK =
total > 0
? Math.round(
(allScenarios.reduce((sum, s) => sum + (s.passAtK[totalRuns - 1] ?? 0), 0) / total) *
100,
)
: 0;
const avgPassHatK =
total > 0
? Math.round(
(allScenarios.reduce((sum, s) => sum + (s.passHatK[totalRuns - 1] ?? 0), 0) / total) *
100,
)
: 0;
console.log('\n=== Aggregate Metrics ===\n');
console.log(` pass@${String(totalRuns)}: ${String(avgPassAtK)}%`);
console.log(` pass^${String(totalRuns)}: ${String(avgPassHatK)}%`);
const passed = allScenarios.filter((s) => s.runs[0]?.success).length;
const total = metrics.scenariosTotal;
console.log(
`${String(metrics.built)}/${String(testCases.length)} built | ${String(passed)}/${String(total)} passed (${String(total > 0 ? Math.round((passed / total) * 100) : 0)}%)`,
);
}
// Totals
const allScenarios = testCases.flatMap((tc) => tc.scenarios);
const total = allScenarios.length;
const built = testCases.filter((tc) => tc.buildSuccessCount > 0).length;
const passedTotal = multiRun
? allScenarios.reduce((sum, s) => sum + s.passCount, 0)
: allScenarios.filter((s) => s.runs[0]?.success).length;
const totalAttempts = multiRun ? total * totalRuns : total;
console.log(
`\n${String(built)}/${String(testCases.length)} built | ${String(passedTotal)}/${String(totalAttempts)} passed (${String(totalAttempts > 0 ? Math.round((passedTotal / totalAttempts) * 100) : 0)}%)`,
);
}
main().catch((error) => {

View File

@ -75,15 +75,10 @@ interface ThreadStatus {
export class N8nClient {
private sessionCookie?: string;
constructor(readonly baseUrl: string) {}
constructor(private readonly baseUrl: string) {}
// -- Auth ----------------------------------------------------------------
/** Set the session cookie directly (for sharing across workers). */
setSessionCookie(cookie: string): void {
this.sessionCookie = cookie;
}
/**
* Authenticate with the n8n instance via POST /rest/login.
* Captures the `n8n-auth` cookie for subsequent requests.

View File

@ -1,8 +1,14 @@
import { readFileSync, readdirSync } from 'fs';
import { join } from 'path';
import { basename, join } from 'path';
import type { WorkflowTestCase } from '../../types';
export interface WorkflowTestCaseWithFile {
testCase: WorkflowTestCase;
/** Filename without extension, e.g. "contact-form-automation" */
fileSlug: string;
}
function parseTestCaseFile(filePath: string): WorkflowTestCase {
const content = readFileSync(filePath, 'utf-8');
try {
@ -14,11 +20,19 @@ function parseTestCaseFile(filePath: string): WorkflowTestCase {
}
}
export function loadWorkflowTestCases(filter?: string): WorkflowTestCase[] {
function getJsonFiles(filter?: string): string[] {
const dir = __dirname;
let files = readdirSync(dir).filter((f) => f.endsWith('.json'));
if (filter) {
files = files.filter((f) => f.toLowerCase().includes(filter.toLowerCase()));
}
return files.map((f) => parseTestCaseFile(join(dir, f)));
return files.map((f) => join(dir, f));
}
/** Load test cases with their file slugs (used as derived IDs for LangSmith dataset sync). */
export function loadWorkflowTestCasesWithFiles(filter?: string): WorkflowTestCaseWithFile[] {
return getJsonFiles(filter).map((f) => ({
testCase: parseTestCaseFile(f),
fileSlug: basename(f, '.json'),
}));
}

View File

@ -10,7 +10,8 @@ export { N8nClient } from './clients/n8n-client';
export type { WorkflowResponse, WorkflowNodeResponse, ExecutionDetail } from './clients/n8n-client';
// -- Test case data --
export { loadWorkflowTestCases } from './data/workflows';
export { loadWorkflowTestCasesWithFiles } from './data/workflows';
export type { WorkflowTestCaseWithFile } from './data/workflows';
// -- Credentials --
export { seedCredentials, cleanupCredentials } from './credentials/seeder';

View File

@ -0,0 +1,323 @@
// ---------------------------------------------------------------------------
// LangSmith dataset sync
//
// Syncs JSON test case files from the repo to a LangSmith dataset.
// Uses derived IDs (fileSlug/scenarioName) so examples are stable across
// runs, enabling experiment comparison over time.
// ---------------------------------------------------------------------------
import { createHash } from 'crypto';
import type { Client } from 'langsmith';
import type { Example, KVMap } from 'langsmith/schemas';
import { z } from 'zod';
import { loadWorkflowTestCasesWithFiles } from '../data/workflows';
import type { EvalLogger } from '../harness/logger';
// Bump this if existing IDs get tombstoned by LangSmith soft-delete and need
// to be regenerated fresh. UUIDs for the same derivedId stay stable within a
// version, so experiment comparison still works.
const UUID_VERSION = 'v2';
/**
* Generate a deterministic UUID from a string.
* Same input always produces the same UUID, so example IDs are stable across runs.
*/
function deterministicUuid(input: string): string {
const hash = createHash('sha256').update(`${UUID_VERSION}:${input}`).digest('hex');
// Format as UUID v4 shape (8-4-4-4-12); the hardcoded '4' and '8' mimic the
// version and RFC 4122 variant nibbles.
return [
hash.slice(0, 8),
hash.slice(8, 12),
'4' + hash.slice(13, 16),
'8' + hash.slice(17, 20),
hash.slice(20, 32),
].join('-');
}
/**
* Shape of the inputs passed to the target function for each scenario.
* `testCaseFile` is included so the LangSmith Inputs column shows which
* workflow a scenario belongs to (metadata is hidden by default).
*/
export const datasetExampleInputsSchema = z.object({
prompt: z.string(),
testCaseFile: z.string(),
scenarioName: z.string(),
scenarioDescription: z.string(),
dataSetup: z.string(),
successCriteria: z.string(),
});
export type DatasetExampleInputs = z.infer<typeof datasetExampleInputsSchema>;
/** Metadata attached to each example for filtering / grouping in the UI. */
export const datasetExampleMetadataSchema = z.object({
/** Duplicated from inputs so the LangSmith UI can group by it (only metadata keys are groupable). */
testCaseFile: z.string(),
complexity: z.enum(['simple', 'medium', 'complex']).optional(),
tags: z.array(z.string()).optional(),
triggerType: z.enum(['manual', 'webhook', 'schedule', 'form']).optional(),
});
export type DatasetExampleMetadata = z.infer<typeof datasetExampleMetadataSchema>;
/**
* Sync JSON test cases to a LangSmith dataset.
*
* - Creates the dataset if it doesn't exist
* - Diffs local scenarios against existing examples
* - Creates, updates, or deletes examples to match
* - Orders examples round-robin across test cases for optimal parallelism
* - Assigns each example to a split (test case file slug) for UI filtering
*
* Returns the dataset name for use with evaluate().
*/
export async function syncDataset(
lsClient: Client,
datasetName: string,
logger: EvalLogger,
filter?: string,
): Promise<string> {
const testCasesWithFiles = loadWorkflowTestCasesWithFiles(filter);
// Round-robin ordering ensures evaluate() triggers diverse builds early
// rather than burning all concurrency slots on one test case.
const scenarios = buildRoundRobinScenarios(testCasesWithFiles);
logger.info(
`Dataset sync: ${String(scenarios.length)} scenarios from ${String(testCasesWithFiles.length)} test cases`,
);
// Create or get dataset. `hasDataset` distinguishes "not found" from auth/
// network errors, so we only create when it genuinely doesn't exist.
let datasetId: string;
if (await lsClient.hasDataset({ datasetName })) {
const dataset = await lsClient.readDataset({ datasetName });
datasetId = dataset.id;
} else {
const dataset = await lsClient.createDataset(datasetName, {
description: 'Instance AI workflow execution evaluations (synced from repo JSON files)',
});
datasetId = dataset.id;
logger.info(`Created dataset: ${datasetName}`);
}
// List existing examples, keyed by derived ID (testCaseFile/scenarioName from inputs).
const existingByDerivedId = new Map<string, Example>();
for await (const example of lsClient.listExamples({ datasetId })) {
const inputs = existingInputsSchema.safeParse(example.inputs);
if (!inputs.success) continue;
existingByDerivedId.set(`${inputs.data.testCaseFile}/${inputs.data.scenarioName}`, example);
}
// Diff and sync
const currentIds = new Set<string>();
const toCreate: Array<{ id: string; inputs: KVMap; metadata: KVMap; split: string }> = [];
const toUpdate: Array<{ id: string; inputs: KVMap; metadata: KVMap; split: string }> = [];
for (const scenario of scenarios) {
const derivedId = `${scenario.testCaseFile}/${scenario.scenarioName}`;
currentIds.add(derivedId);
const inputs: DatasetExampleInputs = {
prompt: scenario.prompt,
testCaseFile: scenario.testCaseFile,
scenarioName: scenario.scenarioName,
scenarioDescription: scenario.scenarioDescription,
dataSetup: scenario.dataSetup,
successCriteria: scenario.successCriteria,
};
const metadata: DatasetExampleMetadata = {
testCaseFile: scenario.testCaseFile,
complexity: scenario.complexity,
tags: scenario.tags,
triggerType: scenario.triggerType,
};
const existingExample = existingByDerivedId.get(derivedId);
if (existingExample) {
if (
hasInputsChanged(existingExample.inputs, inputs) ||
hasMetadataChanged(existingExample.metadata, metadata)
) {
toUpdate.push({
id: existingExample.id,
inputs,
metadata,
split: scenario.testCaseFile,
});
}
} else {
toCreate.push({
id: deterministicUuid(derivedId),
inputs,
metadata,
split: scenario.testCaseFile,
});
}
}
// Only delete stale examples on a full sync (no filter). With a filter,
// we're only syncing a subset and mustn't delete the others.
// LangSmith also soft-deletes, which tombstones the UUID and prevents
// recreation with the same ID on a later full run.
const toDelete: string[] = [];
if (!filter) {
for (const [derivedId, example] of existingByDerivedId) {
if (!currentIds.has(derivedId)) {
toDelete.push(example.id);
}
}
}
if (toCreate.length > 0) {
await lsClient.createExamples(
toCreate.map((e) => ({
id: e.id,
inputs: e.inputs,
metadata: e.metadata,
split: e.split,
dataset_id: datasetId,
})),
);
logger.info(` Created ${String(toCreate.length)} example(s)`);
}
if (toUpdate.length > 0) {
await lsClient.updateExamples(
toUpdate.map((e) => ({
id: e.id,
inputs: e.inputs,
metadata: e.metadata,
split: e.split,
dataset_id: datasetId,
})),
);
logger.info(` Updated ${String(toUpdate.length)} example(s)`);
}
if (toDelete.length > 0) {
await lsClient.deleteExamples(toDelete);
logger.info(` Deleted ${String(toDelete.length)} stale example(s)`);
}
if (toCreate.length === 0 && toUpdate.length === 0 && toDelete.length === 0) {
logger.info(' Dataset up to date');
}
return datasetName;
}
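// Usage sketch (hypothetical wiring; the real call site is the eval runner):
//   import { Client } from 'langsmith';
//   const lsClient = new Client(); // reads LANGSMITH_API_KEY / LANGSMITH_ENDPOINT
//   const datasetName = await syncDataset(lsClient, 'instance-ai-workflow-evals', logger);
//   // datasetName then feeds evaluate({ data: datasetName, ... }) to run the experiment.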
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
interface FlatScenario {
prompt: string;
testCaseFile: string;
scenarioName: string;
scenarioDescription: string;
dataSetup: string;
successCriteria: string;
complexity?: 'simple' | 'medium' | 'complex';
tags?: string[];
triggerType?: 'manual' | 'webhook' | 'schedule' | 'form';
}
/**
* Flatten test cases into scenarios ordered round-robin across test cases.
*
* Input: [tc1(s1,s2,s3), tc2(s1,s2), tc3(s1)]
* Output: [tc1/s1, tc2/s1, tc3/s1, tc1/s2, tc2/s2, tc1/s3]
*/
function buildRoundRobinScenarios(
testCasesWithFiles: Array<{
testCase: {
prompt: string;
complexity?: 'simple' | 'medium' | 'complex';
tags?: string[];
triggerType?: 'manual' | 'webhook' | 'schedule' | 'form';
scenarios: Array<{
name: string;
description: string;
dataSetup: string;
successCriteria: string;
}>;
};
fileSlug: string;
}>,
): FlatScenario[] {
const result: FlatScenario[] = [];
const maxScenarios = Math.max(...testCasesWithFiles.map((tc) => tc.testCase.scenarios.length), 0);
for (let i = 0; i < maxScenarios; i++) {
for (const { testCase, fileSlug } of testCasesWithFiles) {
const scenario = testCase.scenarios[i];
if (scenario) {
result.push({
prompt: testCase.prompt,
testCaseFile: fileSlug,
scenarioName: scenario.name,
scenarioDescription: scenario.description,
dataSetup: scenario.dataSetup,
successCriteria: scenario.successCriteria,
complexity: testCase.complexity,
tags: testCase.tags,
triggerType: testCase.triggerType,
});
}
}
}
return result;
}
// Schemas for reading existing LangSmith example data, which is typed as an
// open KVMap by the SDK. We only parse the fields we care about for diffing.
const existingInputsSchema = z
.object({
prompt: z.string().default(''),
testCaseFile: z.string().default(''),
scenarioName: z.string().default(''),
scenarioDescription: z.string().default(''),
dataSetup: z.string().default(''),
successCriteria: z.string().default(''),
})
.passthrough();
const existingMetadataSchema = z
.object({
testCaseFile: z.string().default(''),
complexity: z.string().default(''),
triggerType: z.string().default(''),
tags: z.array(z.string()).default([]),
})
.passthrough();
function hasInputsChanged(existing: unknown, incoming: DatasetExampleInputs): boolean {
// Treat unparseable existing data as changed so we overwrite with fresh
// values rather than aborting the whole sync on one malformed row.
const parsed = existingInputsSchema.safeParse(existing ?? {});
if (!parsed.success) return true;
const e = parsed.data;
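// scenarioName is intentionally not compared: it is part of the derived ID
// used as the map key, so existing and incoming values match by construction.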
return (
e.prompt !== incoming.prompt ||
e.testCaseFile !== incoming.testCaseFile ||
e.dataSetup !== incoming.dataSetup ||
e.successCriteria !== incoming.successCriteria ||
e.scenarioDescription !== incoming.scenarioDescription
);
}
function hasMetadataChanged(existing: unknown, incoming: DatasetExampleMetadata): boolean {
const parsed = existingMetadataSchema.safeParse(existing ?? {});
if (!parsed.success) return true;
const e = parsed.data;
return (
e.testCaseFile !== incoming.testCaseFile ||
e.complexity !== (incoming.complexity ?? '') ||
e.triggerType !== (incoming.triggerType ?? '') ||
JSON.stringify(e.tags) !== JSON.stringify(incoming.tags ?? [])
);
}
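// Note: the tags comparison is order-sensitive: JSON.stringify serializes
// arrays in order, so reordering tags in a test case file counts as a change
// and triggers an update. Example (hypothetical values):
//   hasMetadataChanged({ tags: ['a', 'b'] }, { testCaseFile: '', tags: ['b', 'a'] }); // true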

View File

@ -0,0 +1,501 @@
/**
* HTML report generator for workflow test case evaluations.
*
* Produces a self-contained HTML file optimized for three tasks:
* 1. Triage: which scenarios failed? (seconds)
* 2. Diagnose: why did they fail? (minutes)
* 3. Compare: what changed between runs? (cross-report)
*/
import fs from 'fs';
import path from 'path';
import type { WorkflowTestCaseResult, ScenarioResult } from '../types';
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
function escapeHtml(str: string): string {
return str
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#39;');
}
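// Example: escapeHtml('<b>"x" & \'y\'</b>') === '&lt;b&gt;&quot;x&quot; &amp; &#39;y&#39;&lt;/b&gt;'
// (& is replaced first so the entities introduced later aren't double-escaped)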
// ---------------------------------------------------------------------------
// Scenario rendering
// ---------------------------------------------------------------------------
function renderScenario(sr: ScenarioResult, index: number): string {
const icon = sr.success ? '&#10003;' : '&#10007;';
const statusClass = sr.success ? 'pass' : 'fail';
// Passing scenarios: compact one-liner with collapsible detail
if (sr.success) {
const summary = sr.reasoning ? sr.reasoning.slice(0, 150) : 'All checks passed';
return `<div class="scenario ${statusClass}">
<div class="scenario-header" onclick="this.parentElement.classList.toggle('expanded')">
<span class="scenario-icon ${statusClass}">${icon}</span>
<span class="scenario-name">${escapeHtml(sr.scenario.name)}</span>
<span class="scenario-summary-inline">${escapeHtml(summary)}${sr.reasoning && sr.reasoning.length > 150 ? '...' : ''}</span>
</div>
<div class="scenario-detail" id="scenario-${String(index)}">
${renderScenarioDetail(sr)}
</div>
</div>`;
}
// Failing scenarios: show error prominently, detail expanded by default
return `<div class="scenario ${statusClass} expanded">
<div class="scenario-header" onclick="this.parentElement.classList.toggle('expanded')">
<span class="scenario-icon ${statusClass}">${icon}</span>
<span class="scenario-name">${escapeHtml(sr.scenario.name)}</span>
<span class="scenario-desc">${escapeHtml(sr.scenario.description)}</span>
</div>
<div class="scenario-detail" id="scenario-${String(index)}">
${renderScenarioDetail(sr)}
</div>
</div>`;
}
function renderScenarioDetail(sr: ScenarioResult): string {
let html = '';
if (!sr.evalResult) {
if (sr.reasoning) {
html += `<div class="diagnosis">${escapeHtml(sr.reasoning)}</div>`;
}
return html;
}
// Failure category badge
if (!sr.success && sr.failureCategory) {
const catClass =
sr.failureCategory === 'builder_issue'
? 'warn'
: sr.failureCategory === 'mock_issue'
? 'fail'
: 'info';
html += `<div class="category-badge category-${catClass}">${escapeHtml(sr.failureCategory)}${sr.rootCause ? ': ' + escapeHtml(sr.rootCause) : ''}</div>`;
}
// 1. Error — what broke
if (sr.evalResult.errors.length > 0) {
html += `<div class="error-box">${escapeHtml(sr.evalResult.errors.join('; '))}</div>`;
}
// Phase 1 warnings
const warnings = sr.evalResult.hints?.warnings ?? [];
if (warnings.length > 0) {
html += `<div class="warning-box">${escapeHtml(warnings.join('; '))}</div>`;
}
// 2. Diagnosis — verifier's reasoning
if (sr.reasoning) {
html += '<details class="section" open><summary>Diagnosis</summary>';
html += `<div class="diagnosis">${escapeHtml(sr.reasoning)}</div>`;
html += '</details>';
}
// 3. Mock data plan — Phase 1 hints
if (sr.evalResult.hints) {
html += '<details class="section"><summary>Mock data plan</summary>';
const { globalContext, triggerContent, nodeHints } = sr.evalResult.hints;
if (globalContext) {
html += '<div class="subsection-label">Global context</div>';
html += `<div class="hint-text">${escapeHtml(globalContext)}</div>`;
}
if (Object.keys(triggerContent ?? {}).length > 0) {
html += '<div class="subsection-label">Trigger content</div>';
html += `<pre class="json-block"><code>${escapeHtml(JSON.stringify(triggerContent, null, 2))}</code></pre>`;
} else {
html +=
'<div class="warning-inline">No trigger content generated \u2014 start node has no input data</div>';
}
if (nodeHints && Object.keys(nodeHints).length > 0) {
html += '<div class="subsection-label">Per-node hints</div>';
for (const [nodeName, hint] of Object.entries(nodeHints)) {
html += `<details class="node-hint"><summary>${escapeHtml(nodeName)}</summary>`;
html += `<div class="hint-text">${escapeHtml(hint)}</div>`;
html += '</details>';
}
}
html += '</details>';
}
// 4. Execution trace — per-node results
const nodeEntries = Object.entries(sr.evalResult.nodeResults);
if (nodeEntries.length > 0) {
html += '<details class="section"><summary>Execution trace</summary>';
html +=
'<div class="trace-legend"><span class="node-mode-mocked">mocked</span> <span class="node-mode-pinned">pinned</span> <span class="node-mode-real">real</span></div>';
for (const [nodeName, nr] of nodeEntries) {
const modeClass = `node-mode-${nr.executionMode}`;
const hasError = nr.configIssues && Object.keys(nr.configIssues).length > 0;
const configWarning = hasError
? `<span class="build-issue">Build issue: ${escapeHtml(Object.values(nr.configIssues!).flat().join('; '))}</span>`
: '';
html += '<div class="trace-node">';
html += '<div class="trace-node-header">';
html += `<span class="${modeClass}">[${nr.executionMode}]</span> <strong>${escapeHtml(nodeName)}</strong>`;
if (nr.interceptedRequests.length > 0) {
html += ` <span class="request-count">${String(nr.interceptedRequests.length)} request(s)</span>`;
}
html += '</div>';
if (configWarning) html += configWarning;
// Intercepted requests
for (const req of nr.interceptedRequests) {
html += '<div class="request-pair">';
html += '<div class="request-header">Request sent</div>';
html += `<div class="request-method">${escapeHtml(req.method)} ${escapeHtml(req.url)}</div>`;
if (req.requestBody) {
html += `<pre class="json-block json-sm"><code>${escapeHtml(JSON.stringify(req.requestBody, null, 2))}</code></pre>`;
}
html += '<div class="response-header">Mock returned</div>';
if (req.mockResponse) {
html += `<pre class="json-block json-sm"><code>${escapeHtml(JSON.stringify(req.mockResponse, null, 2))}</code></pre>`;
} else {
html += '<div class="muted">no mock response</div>';
}
html += '</div>';
}
// Node output
if (nr.output !== null && nr.output !== undefined) {
html += '<details class="node-output-toggle"><summary>Node output</summary>';
html += `<pre class="json-block"><code>${escapeHtml(JSON.stringify(nr.output, null, 2))}</code></pre>`;
html += '</details>';
} else {
html += '<div class="muted">no output</div>';
}
html += '</div>';
}
html += '</details>';
}
return html;
}
// ---------------------------------------------------------------------------
// Workflow summary
// ---------------------------------------------------------------------------
function renderWorkflowSummary(result: WorkflowTestCaseResult): string {
const firstEval = result.scenarioResults[0]?.evalResult;
let nodesHtml = '';
if (firstEval) {
const nodes = Object.entries(firstEval.nodeResults);
if (nodes.length > 0) {
const nodeList = nodes
.map(([name, nr]) => {
const mode = nr.executionMode;
const requests = nr.interceptedRequests.length;
const issues = nr.configIssues ? Object.values(nr.configIssues).flat().join('; ') : '';
let line = `<span class="node-mode-${mode}">[${mode}]</span> ${escapeHtml(name)}`;
if (requests > 0) line += ` <span class="muted">(${String(requests)} req)</span>`;
if (issues)
line += ` <span class="build-issue">Build issue: ${escapeHtml(issues)}</span>`;
return `<li>${line}</li>`;
})
.join('');
nodesHtml = `<details class="section"><summary>Built workflow (${String(nodes.length)} nodes)</summary><ul class="node-list">${nodeList}</ul></details>`;
}
}
let jsonHtml = '';
if (result.workflowJson) {
const raw = JSON.stringify(result.workflowJson, null, 2);
jsonHtml = `<details class="section"><summary>Agent output (raw JSON)</summary><pre class="json-block"><code>${escapeHtml(raw)}</code></pre></details>`;
}
return nodesHtml + jsonHtml;
}
// ---------------------------------------------------------------------------
// Test case rendering
// ---------------------------------------------------------------------------
function renderTestCase(result: WorkflowTestCaseResult, tcIndex: number): string {
const passCount = result.scenarioResults.filter((sr) => sr.success).length;
const totalCount = result.scenarioResults.length;
const allPass = passCount === totalCount && totalCount > 0;
const statusClass = result.workflowBuildSuccess ? (allPass ? 'pass' : 'mixed') : 'fail';
const buildBadge = result.workflowBuildSuccess
? '<span class="badge badge-pass">BUILT</span>'
: '<span class="badge badge-fail">BUILD FAILED</span>';
const scoreBadge =
totalCount > 0
? `<span class="badge badge-${allPass ? 'pass' : 'fail'}">${String(passCount)}/${String(totalCount)}</span>`
: '';
const prompt = result.testCase.prompt;
const truncatedPrompt = prompt.length > 100 ? prompt.slice(0, 100) + '...' : prompt;
// Inline scenario indicators for quick triage without expanding
const scenarioIndicators = result.scenarioResults
.map(
(sr) =>
`<span class="scenario-indicator ${sr.success ? 'pass' : 'fail'}" title="${escapeHtml(sr.scenario.name)}">${sr.success ? '✓' : '✗'} ${escapeHtml(sr.scenario.name)}</span>`,
)
.join(' ');
let scenariosHtml = '';
if (result.scenarioResults.length > 0) {
scenariosHtml = result.scenarioResults
.map((sr, i) => renderScenario(sr, tcIndex * 100 + i))
.join('');
} else if (!result.workflowBuildSuccess) {
const errorDetail = result.buildError
? `<div class="error-box">${escapeHtml(result.buildError)}</div>`
: '';
scenariosHtml = `<div class="muted">Workflow failed to build — no scenarios executed</div>${errorDetail}`;
}
return `<div class="test-case ${statusClass}">
<div class="test-case-header" onclick="this.parentElement.classList.toggle('expanded')">
<div class="test-case-title">
${buildBadge} ${scoreBadge}
<span class="test-case-prompt">${escapeHtml(truncatedPrompt)}</span>
</div>
<div class="test-case-meta">
<span class="badge badge-tag">${escapeHtml(result.testCase.complexity)}</span>
${result.workflowId ? `<span class="workflow-id">${escapeHtml(result.workflowId)}</span>` : ''}
</div>
<div class="scenario-indicators">${scenarioIndicators}</div>
</div>
<div class="test-case-detail">
<details class="section"><summary>Prompt</summary><div class="prompt-text">${escapeHtml(prompt)}</div></details>
${renderWorkflowSummary(result)}
${scenariosHtml}
</div>
</div>`;
}
// ---------------------------------------------------------------------------
// Full report
// ---------------------------------------------------------------------------
export function generateWorkflowReport(results: WorkflowTestCaseResult[]): string {
const totalTestCases = results.length;
const builtCount = results.filter((r) => r.workflowBuildSuccess).length;
const allScenarios = results.flatMap((r) => r.scenarioResults);
const passCount = allScenarios.filter((sr) => sr.success).length;
const failCount = allScenarios.length - passCount;
const totalScenarios = allScenarios.length;
const passRate = totalScenarios > 0 ? Math.round((passCount / totalScenarios) * 100) : 0;
return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Workflow evaluation report</title>
<style>
:root {
--bg-primary: #0d1117;
--bg-secondary: #161b22;
--bg-tertiary: #1c2129;
--border: #30363d;
--border-light: #21262d;
--text-primary: #f0f6fc;
--text-secondary: #c9d1d9;
--text-muted: #8b949e;
--color-pass: #3fb950;
--color-fail: #f85149;
--color-warn: #d29922;
--color-info: #58a6ff;
--color-purple: #bc8cff;
--color-pass-bg: #23863622;
--color-fail-bg: #da363322;
--color-warn-bg: #d2992222;
}
* { margin: 0; padding: 0; box-sizing: border-box; }
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; background: var(--bg-primary); color: var(--text-secondary); padding: 24px; max-width: 1400px; margin: 0 auto; font-size: 14px; line-height: 1.5; }
/* Header */
h1 { color: var(--text-primary); font-size: 20px; margin-bottom: 2px; }
.subtitle { color: var(--text-muted); font-size: 13px; margin-bottom: 20px; }
/* Dashboard */
.dashboard { display: flex; gap: 12px; margin-bottom: 24px; flex-wrap: wrap; align-items: stretch; }
.stat-card { background: var(--bg-secondary); border: 1px solid var(--border); border-radius: 8px; padding: 14px 20px; min-width: 120px; }
.stat-card .label { color: var(--text-muted); font-size: 12px; }
.stat-card .value { color: var(--text-primary); font-size: 26px; font-weight: 700; margin-top: 2px; }
.stat-card .value.pass { color: var(--color-pass); }
.stat-card .value.fail { color: var(--color-fail); }
.stat-card .value.mixed { color: var(--color-warn); }
/* Toolbar */
.toolbar { display: flex; gap: 8px; margin-bottom: 16px; }
.toolbar button { background: var(--bg-secondary); border: 1px solid var(--border); border-radius: 6px; color: var(--text-secondary); padding: 6px 12px; font-size: 12px; cursor: pointer; }
.toolbar button:hover { background: var(--bg-tertiary); color: var(--text-primary); }
.toolbar button.active { border-color: var(--color-info); color: var(--color-info); }
/* Badges */
.badge { display: inline-block; padding: 2px 8px; border-radius: 12px; font-size: 11px; font-weight: 600; margin-right: 4px; }
.badge-pass { background: var(--color-pass-bg); color: var(--color-pass); }
.badge-fail { background: var(--color-fail-bg); color: var(--color-fail); }
.badge-tag { background: var(--border); color: var(--text-muted); }
/* Test case cards */
.test-case { background: var(--bg-secondary); border: 1px solid var(--border); border-radius: 8px; margin-bottom: 10px; overflow: hidden; }
.test-case.pass { border-left: 3px solid var(--color-pass); }
.test-case.fail { border-left: 3px solid var(--color-fail); }
.test-case.mixed { border-left: 3px solid var(--color-warn); }
.test-case-header { padding: 12px 16px; cursor: pointer; }
.test-case-header:hover { background: var(--bg-tertiary); }
.test-case-title { display: flex; align-items: center; gap: 8px; margin-bottom: 4px; }
.test-case-prompt { color: var(--text-primary); font-weight: 500; font-size: 13px; }
.test-case-meta { display: flex; align-items: center; gap: 6px; margin-bottom: 6px; }
.workflow-id { color: var(--text-muted); font-size: 11px; font-family: monospace; }
.scenario-indicators { display: flex; gap: 8px; flex-wrap: wrap; }
.scenario-indicator { font-size: 11px; font-family: monospace; }
.scenario-indicator.pass { color: var(--color-pass); }
.scenario-indicator.fail { color: var(--color-fail); }
.test-case-detail { display: none; padding: 0 16px 16px; }
.test-case.expanded .test-case-detail { display: block; }
/* Sections (collapsible) */
.section { margin: 8px 0; }
.section > summary { cursor: pointer; color: var(--color-info); font-size: 12px; font-weight: 600; padding: 4px 0; }
.section > summary:hover { text-decoration: underline; }
/* Scenarios */
.scenario { border: 1px solid var(--border-light); border-radius: 6px; margin-bottom: 6px; overflow: hidden; }
.scenario-header { padding: 8px 12px; cursor: pointer; display: flex; align-items: center; gap: 8px; font-size: 13px; }
.scenario-header:hover { background: var(--bg-tertiary); }
.scenario-icon { font-weight: bold; font-size: 14px; min-width: 16px; }
.scenario-icon.pass { color: var(--color-pass); }
.scenario-icon.fail { color: var(--color-fail); }
.scenario-name { color: var(--text-primary); font-weight: 600; }
.scenario-desc { color: var(--text-muted); font-size: 12px; }
.scenario-summary-inline { color: var(--text-muted); font-size: 12px; flex: 1; }
.scenario-detail { display: none; padding: 10px 12px; border-top: 1px solid var(--border-light); background: var(--bg-primary); }
.scenario.expanded .scenario-detail { display: block; }
/* Error and warning boxes */
.error-box { color: var(--color-fail); font-size: 12px; padding: 6px 10px; background: var(--color-fail-bg); border-radius: 4px; margin-bottom: 8px; border-left: 3px solid var(--color-fail); }
.warning-box { color: var(--color-warn); font-size: 12px; padding: 6px 10px; background: var(--color-warn-bg); border-radius: 4px; margin-bottom: 8px; border-left: 3px solid var(--color-warn); }
.warning-inline { color: var(--color-warn); font-size: 11px; margin: 4px 0; }
.build-issue { color: var(--color-warn); font-size: 11px; display: block; margin-top: 2px; }
/* Diagnosis */
.diagnosis { color: var(--text-secondary); font-size: 12px; line-height: 1.6; padding: 6px 0; }
/* Prompt */
.prompt-text { color: var(--text-secondary); font-size: 13px; line-height: 1.6; padding: 10px; background: var(--bg-primary); border: 1px solid var(--border); border-radius: 6px; white-space: pre-wrap; }
/* Execution trace */
.trace-legend { font-size: 11px; margin-bottom: 8px; display: flex; gap: 12px; }
.trace-node { border: 1px solid var(--border-light); border-radius: 4px; margin-bottom: 6px; padding: 8px; }
.trace-node-header { font-size: 12px; font-family: monospace; margin-bottom: 4px; }
.request-count { color: var(--text-muted); font-size: 11px; }
/* Request/response pairs */
.request-pair { border: 1px solid var(--border-light); border-radius: 4px; margin: 6px 0; overflow: hidden; }
.request-header { background: #1c3a5e; color: var(--color-info); font-size: 10px; font-weight: 700; padding: 3px 8px; letter-spacing: 0.5px; }
.response-header { background: #2a1f3e; color: var(--color-purple); font-size: 10px; font-weight: 700; padding: 3px 8px; letter-spacing: 0.5px; }
.request-method { font-size: 11px; color: var(--text-primary); padding: 4px 8px; font-family: monospace; font-weight: 600; background: var(--bg-primary); }
/* JSON blocks */
.json-block { font-size: 11px; margin: 4px 0; padding: 8px; background: var(--bg-secondary); border: 1px solid var(--border-light); border-radius: 4px; overflow-x: auto; }
.json-sm { font-size: 10px; }
pre { overflow-x: auto; margin: 0; }
code { color: var(--text-secondary); }
/* Node list */
.node-list { list-style: none; padding: 4px 0; font-size: 12px; font-family: monospace; }
.node-list li { padding: 3px 0; }
.node-mode-mocked { color: var(--color-info); font-weight: 600; }
.node-mode-pinned { color: var(--color-warn); font-weight: 600; }
.node-mode-real { color: var(--color-pass); font-weight: 600; }
/* Node output toggle */
.node-output-toggle { margin: 4px 0; }
.node-output-toggle > summary { cursor: pointer; color: var(--text-muted); font-size: 11px; }
/* Node hint */
.node-hint { margin: 2px 0; }
.node-hint > summary { cursor: pointer; color: var(--text-secondary); font-size: 11px; font-family: monospace; }
.hint-text { color: var(--text-muted); font-size: 11px; padding: 4px 0; line-height: 1.5; }
.subsection-label { color: var(--text-primary); font-size: 11px; font-weight: 600; margin-top: 8px; margin-bottom: 2px; }
/* Category badges */
.category-badge { font-size: 11px; font-weight: 600; padding: 4px 10px; border-radius: 4px; margin-bottom: 8px; }
.category-warn { background: var(--color-warn-bg); color: var(--color-warn); border-left: 3px solid var(--color-warn); }
.category-fail { background: var(--color-fail-bg); color: var(--color-fail); border-left: 3px solid var(--color-fail); }
.category-info { background: #1c3a5e33; color: var(--color-info); border-left: 3px solid var(--color-info); }
/* Utilities */
.muted { color: var(--text-muted); font-size: 12px; }
</style>
</head>
<body>
<h1>Workflow evaluation report</h1>
<p class="subtitle">Generated ${new Date().toLocaleString()} &mdash; ${String(totalScenarios)} scenarios across ${String(totalTestCases)} test cases</p>
<div class="dashboard">
<div class="stat-card">
<div class="label">Pass rate</div>
<div class="value${passRate >= 80 ? ' pass' : passRate >= 50 ? ' mixed' : ' fail'}">${String(passRate)}%</div>
</div>
<div class="stat-card">
<div class="label">Passed</div>
<div class="value pass">${String(passCount)}</div>
</div>
<div class="stat-card">
<div class="label">Failed</div>
<div class="value${failCount > 0 ? ' fail' : ''}">${String(failCount)}</div>
</div>
<div class="stat-card">
<div class="label">Built</div>
<div class="value${builtCount === totalTestCases ? ' pass' : ' mixed'}">${String(builtCount)}/${String(totalTestCases)}</div>
</div>
</div>
<div class="toolbar">
<button onclick="document.querySelectorAll('.test-case').forEach(e => e.classList.add('expanded'))">Expand all</button>
<button onclick="document.querySelectorAll('.test-case').forEach(e => e.classList.remove('expanded'))">Collapse all</button>
<button onclick="document.querySelectorAll('.test-case').forEach(e => { e.style.display = e.classList.contains('pass') ? 'none' : '' }); this.classList.toggle('active')">Show failures only</button>
</div>
${results.map((r, i) => renderTestCase(r, i)).join('')}
</body>
</html>`;
}
// ---------------------------------------------------------------------------
// Write report to disk
// ---------------------------------------------------------------------------
export function writeWorkflowReport(results: WorkflowTestCaseResult[]): string {
const reportDir = path.join(__dirname, '..', '..', '.data');
if (!fs.existsSync(reportDir)) {
fs.mkdirSync(reportDir, { recursive: true });
}
const html = generateWorkflowReport(results);
const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
const reportPath = path.join(reportDir, `workflow-eval-${timestamp}.html`);
fs.writeFileSync(reportPath, html);
// Also write to the stable filename for quick access
fs.writeFileSync(path.join(reportDir, 'workflow-eval-report.html'), html);
return reportPath;
}
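// Usage note (illustrative path): the returned value is the timestamped file,
// e.g. '.data/workflow-eval-2026-04-23T09-47-02.html', while the stable
// '.data/workflow-eval-report.html' alias is what CI uploads as an artifact.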