Merge branch 'master' into node-4885-add-ms-service-principal-credentials

This commit is contained in:
Jon 2026-05-06 09:54:10 +01:00 committed by GitHub
commit bafea0d898
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
142 changed files with 10618 additions and 2363 deletions


@@ -0,0 +1,150 @@
---
description: >-
Checks if a community pull request is ready for human review. Verifies CLA
signature, PR title format, description completeness, test coverage, and
cubic-dev-ai issues. Use when given a PR number or branch name to review,
or when the user says /community-pr-review, /pr-review, or asks to check if
a PR is ready for review.
allowed-tools: Bash(gh:*), Bash(git:*), Read, Glob, Grep
---
# Community PR Review
Given a PR number or branch name, determine whether it is ready for human review.
## Steps
### 1. Resolve the PR
If given a branch name, find the PR number first:
```bash
gh pr view <branch> --repo n8n-io/n8n --json number --jq .number
```
### 2. Fetch PR data
```bash
gh pr view <number> --repo n8n-io/n8n \
--json number,title,body,author,headRefName,headRefOid,files,isDraft,state
```
Fetch in parallel:
```bash
# CLA commit status (primary signal) — statuses are newest-first; use the first returned entry
gh api --paginate "repos/n8n-io/n8n/commits/<headRefOid>/statuses" \
--jq '[.[] | select(.context == "license/cla") | {state, description}] | first'
# CLAassistant issue comment (fallback when no commit status) — use the last returned entry
gh api --paginate "repos/n8n-io/n8n/issues/<number>/comments" \
--jq '[.[] | select(.user.login == "CLAassistant") | .body] | last'
# cubic-dev-ai PR review comments (streamed so results concatenate cleanly across pages)
gh api --paginate "repos/n8n-io/n8n/pulls/<number>/comments" \
--jq '.[] | select(.user.login == "cubic-dev-ai[bot]") | {body: .body, path: .path}'
```
### 3. Run the five checks
#### A. CLA signed
Check the `license/cla` commit status first; fall back to the CLAassistant comment if no status exists.
**Commit status** (`context == "license/cla"`):
- `state: "success"` → ✅ signed
- `state: "failure"` or `state: "error"` → ❌ not signed
- `state: "pending"` → ⏳ pending
- Not present → fall back to comment
**CLAassistant issue comment** (fallback):
- Body contains `"All committers have signed the CLA."` → ✅ signed
- Body contains `"not signed"` or a link to sign → ❌ not signed
- No comment → ❌ treat as not signed
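The status-then-comment decision table above can be sketched as a small predicate. This is a minimal illustration, not part of the skill: `claVerdict` is a hypothetical helper, and it assumes the caller passes in the first `license/cla` status (or `null`) and the last CLAassistant comment body (or `null`) fetched in step 2.

```typescript
type ClaVerdict = 'signed' | 'not_signed' | 'pending';

// Commit status is the primary signal; the CLAassistant comment is the fallback.
function claVerdict(status: { state: string } | null, comment: string | null): ClaVerdict {
  if (status) {
    if (status.state === 'success') return 'signed';
    if (status.state === 'failure' || status.state === 'error') return 'not_signed';
    if (status.state === 'pending') return 'pending';
  }
  // Fallback: no comment at all is treated as not signed.
  if (comment?.includes('All committers have signed the CLA.')) return 'signed';
  return 'not_signed';
}
```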
#### B. PR title format
For all types except `revert`, the title must match:
```
^(feat|fix|perf|test|docs|refactor|build|ci|chore)(\([a-zA-Z0-9 ]+( Node)?\))?!?: [A-Z].+[^.]$
```
For `revert` titles, the summary is the original commit header (which starts with a lowercase type), so capitalization is not enforced:
```
^revert(\([a-zA-Z0-9 ]+( Node)?\))?!?: .+[^.]$
```
- Type must be one of: `feat fix perf test docs refactor build ci chore revert`
- Scope is optional, in parentheses e.g. `(editor)` or `(Slack Node)`
- Breaking changes: `!` before the colon
- Summary: starts with capital letter (lowercase allowed for `revert:`), no trailing period
- No Linear ticket IDs in the title (e.g. `N8N-1234`)
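Putting the two patterns and the ticket-ID rule together, the title check might look like this sketch (`checkTitle` is illustrative; the regexes are copied from above):

```typescript
const GENERAL =
  /^(feat|fix|perf|test|docs|refactor|build|ci|chore)(\([a-zA-Z0-9 ]+( Node)?\))?!?: [A-Z].+[^.]$/;
const REVERT = /^revert(\([a-zA-Z0-9 ]+( Node)?\))?!?: .+[^.]$/;
const LINEAR_TICKET = /N8N-\d+/;

function checkTitle(title: string): boolean {
  if (LINEAR_TICKET.test(title)) return false; // no Linear ticket IDs in the title
  return GENERAL.test(title) || REVERT.test(title);
}
```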
#### C. PR description completeness
1. **Summary** (`## Summary`) — must have non-empty content below the heading (not just the HTML comment).
2. **Related tickets** (`## Related Linear tickets, Github issues, and Community forum posts`) — acceptable content: a URL (`http`), a GitHub closing keyword (`closes #N`, `fixes #N`, `resolves #N`, etc.), or empty. Only flag if the section heading is missing entirely.
3. **Checklist** (`## Review / Merge checklist`) — all four items must be present. Unchecked checkboxes are expected for community PRs; do **not** flag them as missing.
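A rough sketch of the three description checks, with simplifications: it reduces the checklist check to heading presence (the real check also verifies all four items exist), and `checkDescription` plus the section-extraction regex are assumptions, not the skill's implementation.

```typescript
function checkDescription(body: string): { summary: boolean; tickets: boolean; checklist: boolean } {
  // Grab the text between "## Summary" and the next "## " heading (or end of body).
  const summaryMatch = /## Summary\n([\s\S]*?)(?=\n## |$)/.exec(body);
  // Strip HTML template comments before deciding whether real content exists.
  const summaryText = (summaryMatch?.[1] ?? '').replace(/<!--[\s\S]*?-->/g, '').trim();
  return {
    summary: summaryText.length > 0,
    // Only the heading's presence matters; empty content is acceptable.
    tickets: body.includes('## Related Linear tickets, Github issues, and Community forum posts'),
    // Unchecked boxes are expected for community PRs, so presence is enough here.
    checklist: body.includes('## Review / Merge checklist'),
  };
}
```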
#### D. Tests
Skip this check if the PR type (from the title) is `docs`, `ci`, `chore`, or `build`.
Otherwise:
1. Identify source files changed: non-test files under `packages/` from the `files` list.
2. If there are source file changes, check out the PR in a temporary worktree:
```bash
git fetch origin pull/<number>/head:pr/<number>
git worktree add /tmp/pr-<number>-review pr/<number>
```
3. Read the changed source files from the worktree to understand whether the changes introduce logic that warrants tests (new functions, bug fixes, behaviour changes, data transformations). Pure config changes, type-only changes, and trivial renames do not require tests.
4. Look for matching test files (`*.test.ts`, `*.spec.ts`, files inside `__tests__/`) among the changed files.
5. **Always clean up the worktree**, even if a previous check failed:
```bash
git worktree remove /tmp/pr-<number>-review --force
git branch -D pr/<number>
```
Report:
- ✅ Tests present, or change does not require tests
- ❌ Source logic changed but no test files found
#### E. cubic-dev-ai issues
Review the PR review comments fetched in step 2. `cubic-dev-ai[bot]` leaves comments for every issue it finds.
- No comments from `cubic-dev-ai[bot]`, or every comment explicitly states no issues were found → ✅
- Any other comment → ❌ report the total count and priority breakdown (e.g. "3 issues: 1× P1, 1× P2, 1× P3")
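How the count and priority breakdown might be assembled, as an illustration only — the assumption that a `P1`/`P2`/`P3` tag appears in each cubic comment body is hypothetical, and `cubicSummary` is not a real helper:

```typescript
function cubicSummary(bodies: string[]): string {
  if (bodies.length === 0) return 'no issues';
  const byPriority = new Map<string, number>();
  for (const body of bodies) {
    // Hypothetical: pull the first "P<digit>" tag out of each comment body.
    const p = /\bP[0-9]\b/.exec(body)?.[0] ?? 'unknown';
    byPriority.set(p, (byPriority.get(p) ?? 0) + 1);
  }
  const breakdown = [...byPriority.entries()].map(([p, n]) => `${n}× ${p}`).join(', ');
  return `${bodies.length} issues: ${breakdown}`;
}
```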
### 4. Output
Always output valid JSON in this exact shape:
```json
{
"readyForReview": <true if all passing checks allow merge, false otherwise>,
"messageForUser": "<Human-readable summary of what needs to change, written as if posted directly to the PR contributor. 'N/A' if nothing is needed.>",
"checks": {
"CLA": <true if signed, false if not signed or pending>,
"Title": <true if title matches convention, false otherwise>,
"Description": <true if all three template sections are complete, false otherwise>,
"TestsNeeded": <true if the code changes require tests, false if not applicable>,
"TestsIncluded": <true if test files are present in the PR, false otherwise>,
"CubicIssues": <true if cubic-dev-ai raised issues, false if no issues>
}
}
```
`readyForReview` is `true` only when: `CLA`, `Title`, and `Description` are all `true`; `CubicIssues` is `false`; and either `TestsNeeded` is `false` or `TestsIncluded` is `true`.
`messageForUser` should be a short, friendly message directed at the contributor listing exactly what they need to address. If `readyForReview` is `true`, set it to `"N/A"`.
Output nothing other than the JSON block.
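The gating rule reduces to a single predicate over the `checks` object. A minimal sketch (`Checks` mirrors the JSON shape; the function name is illustrative):

```typescript
interface Checks {
  CLA: boolean;
  Title: boolean;
  Description: boolean;
  TestsNeeded: boolean;
  TestsIncluded: boolean;
  CubicIssues: boolean;
}

function readyForReview(c: Checks): boolean {
  // Tests are satisfied when none are needed, or when they are included.
  const testsOk = !c.TestsNeeded || c.TestsIncluded;
  return c.CLA && c.Title && c.Description && !c.CubicIssues && testsOk;
}
```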
## Notes
- Draft PRs — report all findings but note the PR is a draft.
- If the PR is already merged or closed, say so and skip the checks.
- Always remove the worktree even if earlier checks failed.


@@ -143,7 +143,7 @@ jobs:
--base-url "$BASE_URLS" \
--concurrency 32 \
--verbose \
--iterations 3 \
--iterations 5 \
${{ inputs.filter && format('--filter "{0}"', inputs.filter) || '' }}
- name: Stop n8n containers
@@ -160,22 +160,16 @@ jobs:
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
RESULTS_FILE="packages/@n8n/instance-ai/eval-results.json"
if [ ! -f "$RESULTS_FILE" ]; then
echo "No eval results file found"
# The eval CLI writes the full PR comment as eval-pr-comment.md
# (see comparison/format.ts:formatComparisonMarkdown). It includes
# the alert, aggregate, comparison sections, per-test-case results
# collapsed, and failure details collapsed. CI just relays it.
COMMENT_FILE="packages/@n8n/instance-ai/eval-pr-comment.md"
if [ ! -f "$COMMENT_FILE" ]; then
echo "No PR comment file found (eval likely cancelled before writing results)"
exit 0
fi
# Build the full comment body with jq
jq -r '
"### Instance AI Workflow Eval Results\n\n" +
"**\(.summary.built)/\(.summary.testCases) built | \(.totalRuns) run(s) | pass@\(.totalRuns): \(.summary.passAtK * 100 | floor)% | pass^\(.totalRuns): \(.summary.passHatK * 100 | floor)% | iterations: \(.summary.passRatePerIter)**\n\n" +
"| Workflow | Build | pass@\(.totalRuns) | pass^\(.totalRuns) |\n|---|---|---|---|\n" +
([.testCases[] as $tc | "| \($tc.name) | \($tc.buildSuccessCount)/\($tc.totalRuns) | \(([$tc.scenarios[] | .passAtK] | add) / ($tc.scenarios | length) * 100 | floor)% | \(([$tc.scenarios[] | .passHatK] | add) / ($tc.scenarios | length) * 100 | floor)% |"] | join("\n")) +
"\n\n<details><summary>Failure details</summary>\n\n" +
([.testCases[] as $tc | $tc.scenarios[] | select(.passHatK < 1) | "**\($tc.name) / \(.name)** — \(.passCount)/\(.totalRuns) passed" + "\n" + ([.runs[] | select(.passed == false) | "> Run\(if .failureCategory then " [\(.failureCategory)]" else "" end): \(.reasoning | .[0:200])"] | join("\n"))] | join("\n\n")) +
"\n</details>"
' "$RESULTS_FILE" > /tmp/eval-comment.md
cp "$COMMENT_FILE" /tmp/eval-comment.md
# Find and update existing eval comment, or create new one
COMMENT_ID=$(gh api "repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \

.gitignore vendored

@@ -36,6 +36,7 @@ packages/testing/playwright/playwright-report
packages/testing/playwright/test-results
packages/testing/playwright/eval-results.json
packages/@n8n/instance-ai/eval-results.json
packages/@n8n/instance-ai/eval-pr-comment.md
packages/testing/playwright/.playwright-browsers
packages/testing/playwright/.playwright-cli
test-results/


@@ -195,8 +195,8 @@ export class ParseValidateHandler {
builder.generatePinData({ beforeWorkflow: currentWorkflow });
}
// Convert to JSON
const workflowJson: WorkflowJSON = builder.toJSON();
// Convert to JSON with Dagre layout matching the FE's tidy-up
const workflowJson: WorkflowJSON = builder.toJSON({ tidyUp: true });
this.logger?.debug('Parsed workflow', {
id: workflowJson.id,


@@ -276,6 +276,7 @@ export {
toolResultPayloadSchema,
toolErrorPayloadSchema,
confirmationRequestPayloadSchema,
confirmationInputTypeSchema,
credentialRequestSchema,
workflowSetupNodeSchema,
errorPayloadSchema,
@@ -284,6 +285,7 @@ export {
mcpToolCallRequestSchema,
mcpToolCallResultSchema,
getRenderHint,
isDisplayableConfirmationRequest,
isSafeObjectKey,
DEFAULT_INSTANCE_AI_PERMISSIONS,
UNLIMITED_CREDITS,
@@ -316,6 +318,8 @@ export type {
InstanceAiEventType,
InstanceAiRunStatus,
InstanceAiConfirmation,
InstanceAiConfirmationInputType,
InstanceAiConfirmationRequestPayload,
InstanceAiConfirmationSeverity,
InstanceAiCredentialRequest,
InstanceAiAgentStatus,


@@ -1,6 +1,9 @@
import {
applyBranchReadOnlyOverrides,
DEFAULT_INSTANCE_AI_PERMISSIONS,
isDisplayableConfirmationRequest,
type InstanceAiConfirmationInputType,
type InstanceAiConfirmationRequestPayload,
type InstanceAiPermissions,
} from '../instance-ai.schema';
@@ -53,3 +56,178 @@ describe('applyBranchReadOnlyOverrides', () => {
expect(original.createWorkflow).toBe('require_approval');
});
});
function makeConfirmation(
overrides: Partial<InstanceAiConfirmationRequestPayload> = {},
): InstanceAiConfirmationRequestPayload {
return {
requestId: 'req-1',
toolCallId: 'tc-1',
toolName: 'tool',
args: {},
severity: 'info',
message: 'Please approve',
...overrides,
};
}
describe('isDisplayableConfirmationRequest', () => {
it('treats approval and text messages as displayable', () => {
expect(isDisplayableConfirmationRequest(makeConfirmation({ inputType: 'approval' }))).toBe(
true,
);
expect(isDisplayableConfirmationRequest(makeConfirmation({ inputType: 'text' }))).toBe(true);
});
it('does not treat metadata-only approval prompts as displayable', () => {
expect(isDisplayableConfirmationRequest(makeConfirmation({ message: ' ' }))).toBe(false);
});
it('does not treat intro-only questions prompts as displayable', () => {
expect(
isDisplayableConfirmationRequest(
makeConfirmation({
inputType: 'questions',
message: '',
introMessage: 'A little context before the questions',
}),
),
).toBe(false);
});
it('recognizes typed display variants', () => {
expect(
isDisplayableConfirmationRequest(
makeConfirmation({
inputType: 'questions',
message: '',
questions: [{ id: 'q1', question: 'Pick one', type: 'single', options: ['A'] }],
}),
),
).toBe(true);
expect(
isDisplayableConfirmationRequest(
makeConfirmation({
inputType: 'plan-review',
message: 'Ignored for displayability',
planItems: [{ id: 'task-1', title: 'Task', kind: 'delegate', spec: 'Do it', deps: [] }],
}),
),
).toBe(true);
expect(
isDisplayableConfirmationRequest(
makeConfirmation({
inputType: 'resource-decision',
message: '',
resourceDecision: {
toolGroup: 'filesystem',
resource: '/tmp',
description: 'Access /tmp',
options: ['allowForSession'],
},
}),
),
).toBe(true);
expect(
isDisplayableConfirmationRequest(
makeConfirmation({
message: '',
setupRequests: [
{
node: {
name: 'Webhook',
type: 'n8n-nodes-base.webhook',
typeVersion: 1,
parameters: {},
position: [0, 0],
id: 'node-1',
},
isTrigger: true,
},
],
}),
),
).toBe(true);
expect(
isDisplayableConfirmationRequest(
makeConfirmation({
message: '',
credentialRequests: [
{ credentialType: 'httpBasicAuth', reason: 'Required', existingCredentials: [] },
],
}),
),
).toBe(true);
expect(
isDisplayableConfirmationRequest(
makeConfirmation({
message: '',
domainAccess: { url: 'https://example.com', host: 'example.com' },
}),
),
).toBe(true);
});
it('does not treat credential flow metadata as displayable on its own', () => {
expect(
isDisplayableConfirmationRequest(
makeConfirmation({
message: '',
credentialFlow: { stage: 'finalize' },
}),
),
).toBe(false);
});
it('does not treat lightweight task lists as displayable plan reviews', () => {
expect(
isDisplayableConfirmationRequest(
makeConfirmation({
inputType: 'plan-review',
message: 'Ignored for displayability',
tasks: {
tasks: [{ id: 'task-1', description: 'Do it', status: 'todo' }],
},
}),
),
).toBe(false);
});
it('recognizes only renderable task args for plan reviews', () => {
expect(
isDisplayableConfirmationRequest(
makeConfirmation({
inputType: 'plan-review',
message: 'Ignored for displayability',
args: {
tasks: [{ id: 'task-1', title: 'Task', kind: 'delegate', spec: 'Do it', deps: [] }],
},
}),
),
).toBe(true);
expect(
isDisplayableConfirmationRequest(
makeConfirmation({
inputType: 'plan-review',
message: 'Ignored for displayability',
args: {
tasks: [{ id: 'task-1', description: 'Do it', status: 'todo' }],
},
}),
),
).toBe(false);
});
it('keeps the input type switch exhaustive', () => {
const handled = {
approval: true,
text: true,
questions: true,
'plan-review': true,
'resource-decision': true,
} satisfies Record<InstanceAiConfirmationInputType, true>;
expect(Object.keys(handled)).toHaveLength(5);
});
});


@@ -297,6 +297,15 @@ export type GatewayConfirmationRequiredPayload = z.infer<
// ---------------------------------------------------------------------------
export const confirmationInputTypeSchema = z.enum([
'approval',
'text',
'questions',
'plan-review',
'resource-decision',
]);
export type InstanceAiConfirmationInputType = z.infer<typeof confirmationInputTypeSchema>;
export const confirmationRequestPayloadSchema = z.object({
requestId: z.string(),
inputThreadId: z
@@ -315,8 +324,7 @@ export const confirmationRequestPayloadSchema = z.object({
.describe(
'Target project ID — used to scope actions (e.g. credential creation) to the correct project',
),
inputType: z
.enum(['approval', 'text', 'questions', 'plan-review', 'resource-decision'])
inputType: confirmationInputTypeSchema
.optional()
.describe(
'UI mode: approval (default) shows approve/deny, text shows a text input, ' +
@@ -359,6 +367,53 @@ export const confirmationRequestPayloadSchema = z.object({
.optional()
.describe('Gateway resource-access decision data (inputType=resource-decision)'),
});
export type InstanceAiConfirmationRequestPayload = z.infer<typeof confirmationRequestPayloadSchema>;
function isNonEmptyString(value: unknown): value is string {
return typeof value === 'string' && value.trim().length > 0;
}
function hasItems<T>(items: T[] | undefined): items is [T, ...T[]] {
return Array.isArray(items) && items.length > 0;
}
function argsContainPlannedTasks(args: Record<string, unknown>): boolean {
const tasks = args.tasks;
if (!Array.isArray(tasks)) return false;
return tasks.some((task) => plannedTaskArgSchema.safeParse(task).success);
}
function assertNever(value: never): never {
throw new Error(`Unhandled confirmation input type: ${String(value)}`);
}
/**
* True when the current frontend has enough typed confirmation payload to show
* a meaningful waiting-for-user UI. Correlation metadata alone must not count.
*/
export function isDisplayableConfirmationRequest(
payload: InstanceAiConfirmationRequestPayload,
): boolean {
if (hasItems(payload.setupRequests)) return true;
if (hasItems(payload.credentialRequests)) return true;
if (payload.domainAccess) return true;
const inputType = payload.inputType ?? 'approval';
switch (inputType) {
case 'approval':
case 'text':
return isNonEmptyString(payload.message);
case 'questions':
return hasItems(payload.questions);
case 'plan-review':
return hasItems(payload.planItems) || argsContainPlannedTasks(payload.args);
case 'resource-decision':
return payload.resourceDecision !== undefined;
default:
return assertNever(inputType);
}
}
export const statusPayloadSchema = z.object({
message: z.string().describe('Transient status message. Empty string clears the indicator.'),


@@ -61,6 +61,7 @@ export default [
| [no-restricted-globals](docs/rules/no-restricted-globals.md) | Disallow usage of restricted global variables in community nodes. | ✅ | | | | |
| [no-restricted-imports](docs/rules/no-restricted-imports.md) | Disallow usage of restricted imports in community nodes. | ✅ | | | | |
| [no-runtime-dependencies](docs/rules/no-runtime-dependencies.md) | Disallow non-empty "dependencies" in community node package.json | ✅ ☑️ | | | | |
| [no-template-placeholders](docs/rules/no-template-placeholders.md) | Disallow unresolved template placeholders in package.json | ✅ ☑️ | | | | |
| [node-class-description-icon-missing](docs/rules/node-class-description-icon-missing.md) | Node class description must have an `icon` property defined. Deprecated: use `require-node-description-fields` instead. | | | | 💡 | ❌ |
| [node-connection-type-literal](docs/rules/node-connection-type-literal.md) | Disallow string literals in node description `inputs`/`outputs` — use `NodeConnectionTypes` enum instead | ✅ ☑️ | | 🔧 | | |
| [node-operation-error-itemindex](docs/rules/node-operation-error-itemindex.md) | Require { itemIndex } in NodeOperationError / NodeApiError options inside item loops | ✅ ☑️ | | | | |


@@ -0,0 +1,51 @@
# Disallow unresolved template placeholders in package.json (`@n8n/community-nodes/no-template-placeholders`)
💼 This rule is enabled in the following configs: ✅ `recommended`, ☑️ `recommendedWithoutN8nCloudSupport`.
<!-- end auto-generated rule header -->
## Rule Details
Community node packages are typically scaffolded from a starter template that contains
placeholder values such as `<PACKAGE_NAME>`, `<USERNAME>`, or `{{ authorName }}`. When
these placeholders survive into a published `package.json`, the package metadata is
broken — the name is invalid, the repository link is dead, etc.
This rule scans every string value in `package.json` and reports any value containing
an unresolved placeholder pattern. It catches:
- Angle bracket placeholders: `<...>`
- Mustache placeholders: `{{...}}`
The rule applies to **all** string fields, including custom ones — not just the well-known
fields like `name`, `description`, `homepage`, or `repository.url`.
## Examples
### Incorrect
```json
{
"name": "n8n-nodes-<PACKAGE_NAME>",
"description": "An n8n community node for {{service}}",
"homepage": "https://github.com/<USERNAME>/n8n-nodes-example#readme",
"repository": {
"type": "git",
"url": "git+https://github.com/<USERNAME>/n8n-nodes-example.git"
}
}
```
### Correct
```json
{
"name": "n8n-nodes-acme",
"description": "An n8n community node for the Acme API",
"homepage": "https://github.com/acme/n8n-nodes-acme#readme",
"repository": {
"type": "git",
"url": "git+https://github.com/acme/n8n-nodes-acme.git"
}
}
```


@@ -33,6 +33,7 @@ const configs = {
'@n8n/community-nodes/no-http-request-with-manual-auth': 'error',
'@n8n/community-nodes/no-overrides-field': 'error',
'@n8n/community-nodes/no-runtime-dependencies': 'error',
'@n8n/community-nodes/no-template-placeholders': 'error',
'@n8n/community-nodes/icon-validation': 'error',
'@n8n/community-nodes/options-sorted-alphabetically': 'warn',
'@n8n/community-nodes/resource-operation-pattern': 'warn',
@@ -67,6 +68,7 @@ const configs = {
'@n8n/community-nodes/no-http-request-with-manual-auth': 'error',
'@n8n/community-nodes/no-overrides-field': 'error',
'@n8n/community-nodes/no-runtime-dependencies': 'error',
'@n8n/community-nodes/no-template-placeholders': 'error',
'@n8n/community-nodes/icon-validation': 'error',
'@n8n/community-nodes/options-sorted-alphabetically': 'warn',
'@n8n/community-nodes/credential-documentation-url': 'error',


@@ -15,6 +15,7 @@ import { NoOverridesFieldRule } from './no-overrides-field.js';
import { NoRestrictedGlobalsRule } from './no-restricted-globals.js';
import { NoRestrictedImportsRule } from './no-restricted-imports.js';
import { NoRuntimeDependenciesRule } from './no-runtime-dependencies.js';
import { NoTemplatePlaceholdersRule } from './no-template-placeholders.js';
import { NodeClassDescriptionIconMissingRule } from './node-class-description-icon-missing.js';
import { NodeConnectionTypeLiteralRule } from './node-connection-type-literal.js';
import { NodeOperationErrorItemIndexRule } from './node-operation-error-itemindex.js';
@@ -45,6 +46,7 @@ export const rules = {
'no-http-request-with-manual-auth': NoHttpRequestWithManualAuthRule,
'no-overrides-field': NoOverridesFieldRule,
'no-runtime-dependencies': NoRuntimeDependenciesRule,
'no-template-placeholders': NoTemplatePlaceholdersRule,
'icon-validation': IconValidationRule,
'resource-operation-pattern': ResourceOperationPatternRule,
'credential-documentation-url': CredentialDocumentationUrlRule,


@@ -0,0 +1,135 @@
import { RuleTester } from '@typescript-eslint/rule-tester';
import { NoTemplatePlaceholdersRule } from './no-template-placeholders.js';
const ruleTester = new RuleTester();
ruleTester.run('no-template-placeholders', NoTemplatePlaceholdersRule, {
valid: [
{
name: 'package.json with no placeholders',
filename: 'package.json',
code: `{
"name": "n8n-nodes-example",
"version": "1.0.0",
"description": "An example community node",
"homepage": "https://example.com",
"repository": { "type": "git", "url": "git+https://github.com/acme/n8n-nodes-example.git" }
}`,
},
{
name: 'angle brackets that do not look like placeholders are ignored',
filename: 'package.json',
code: '{ "description": "Compares a < b values" }',
},
{
name: 'single curly braces are ignored',
filename: 'package.json',
code: '{ "description": "Use { key: value } syntax" }',
},
{
name: 'non-package.json file is ignored even if it has placeholders',
filename: 'tsconfig.json',
code: '{ "name": "<PACKAGE_NAME>" }',
},
{
name: 'numeric and boolean values are not flagged',
filename: 'package.json',
code: '{ "name": "n8n-nodes-example", "private": false, "engines": { "node": ">=18" } }',
},
],
invalid: [
{
name: 'angle bracket placeholder in name',
filename: 'package.json',
code: '{ "name": "n8n-nodes-<PACKAGE_NAME>" }',
errors: [
{
messageId: 'unresolvedPlaceholder',
data: { pattern: '<PACKAGE_NAME>' },
},
],
},
{
name: 'angle bracket placeholder in description',
filename: 'package.json',
code: '{ "description": "An n8n community node for <SERVICE>" }',
errors: [
{
messageId: 'unresolvedPlaceholder',
data: { pattern: '<SERVICE>' },
},
],
},
{
name: 'angle bracket placeholder in repository url',
filename: 'package.json',
code: '{ "repository": { "type": "git", "url": "git+https://github.com/<USERNAME>/n8n-nodes-example.git" } }',
errors: [
{
messageId: 'unresolvedPlaceholder',
data: { pattern: '<USERNAME>' },
},
],
},
{
name: 'angle bracket placeholder in homepage',
filename: 'package.json',
code: '{ "homepage": "https://github.com/<USERNAME>/n8n-nodes-example#readme" }',
errors: [
{
messageId: 'unresolvedPlaceholder',
data: { pattern: '<USERNAME>' },
},
],
},
{
name: 'mustache placeholder in author',
filename: 'package.json',
code: '{ "author": "{{ authorName }}" }',
errors: [
{
messageId: 'unresolvedPlaceholder',
data: { pattern: '{{ authorName }}' },
},
],
},
{
name: 'mustache placeholder inside larger string',
filename: 'package.json',
code: '{ "description": "Node by {{author}} for service" }',
errors: [
{
messageId: 'unresolvedPlaceholder',
data: { pattern: '{{author}}' },
},
],
},
{
name: 'placeholder in custom field',
filename: 'package.json',
code: '{ "n8n": { "n8nNodesApiVersion": 1, "credentials": ["<CREDENTIAL>"] } }',
errors: [
{
messageId: 'unresolvedPlaceholder',
data: { pattern: '<CREDENTIAL>' },
},
],
},
{
name: 'multiple placeholders in different fields are all reported',
filename: 'package.json',
code: '{ "name": "<NAME>", "description": "{{description}}" }',
errors: [
{
messageId: 'unresolvedPlaceholder',
data: { pattern: '<NAME>' },
},
{
messageId: 'unresolvedPlaceholder',
data: { pattern: '{{description}}' },
},
],
},
],
});


@@ -0,0 +1,68 @@
import type { TSESTree } from '@typescript-eslint/utils';
import { AST_NODE_TYPES } from '@typescript-eslint/utils';
import { createRule } from '../utils/index.js';
const ANGLE_PLACEHOLDER = /<[^<>\n]+?>/;
const MUSTACHE_PLACEHOLDER = /\{\{[^{}\n]+?\}\}/;
function findPlaceholder(value: string): { pattern: string; type: 'angle' | 'mustache' } | null {
const angleMatch = ANGLE_PLACEHOLDER.exec(value);
if (angleMatch) {
return { pattern: angleMatch[0], type: 'angle' };
}
const mustacheMatch = MUSTACHE_PLACEHOLDER.exec(value);
if (mustacheMatch) {
return { pattern: mustacheMatch[0], type: 'mustache' };
}
return null;
}
export const NoTemplatePlaceholdersRule = createRule({
name: 'no-template-placeholders',
meta: {
type: 'problem',
docs: {
description: 'Disallow unresolved template placeholders in package.json',
},
messages: {
unresolvedPlaceholder:
'String value contains an unresolved template placeholder "{{ pattern }}". Replace it with a real value before publishing.',
},
schema: [],
},
defaultOptions: [],
create(context) {
if (!context.filename.endsWith('package.json')) {
return {};
}
return {
Literal(node: TSESTree.Literal) {
if (typeof node.value !== 'string') {
return;
}
// Skip property keys — only flag values.
if (
node.parent?.type === AST_NODE_TYPES.Property &&
node.parent.key === node &&
!node.parent.computed
) {
return;
}
const placeholder = findPlaceholder(node.value);
if (!placeholder) {
return;
}
context.report({
node,
messageId: 'unresolvedPlaceholder',
data: { pattern: placeholder.pattern },
});
},
};
},
});


@@ -382,14 +382,21 @@ The processor is configurable via `disableDeferredTools` flag.
## MCP Integration
External MCP servers are connected via `McpClientManager`. Their tools are:
External MCP servers are owned by `McpClientManager` (`mcp/mcp-client-manager.ts`).
The CLI's `InstanceAiService` holds one manager instance and passes it to
`createInstanceAgent` via options; the agent factory calls
`mcpManager.getRegularTools(mcpServers)` and
`mcpManager.getBrowserTools(orchestrationContext?.browserMcpConfig)`. Tool
descriptions are:
1. **Schema-sanitized** for Anthropic compatibility (ZodNull → optional,
discriminated unions → flattened objects, array types → recursive element fix)
2. **Name-checked** against reserved domain tool names (prevents malicious
shadowing of tools like `run-workflow`)
3. **Separated** from domain tools in the orchestrator's tool set
4. **Cached** by config hash across agent instances
4. **Cached** by config hash inside the manager — the underlying `MCPClient`
instances are tracked so `mcpManager.disconnect()` (called during service
shutdown) closes SSE / stdio connections cleanly.
Browser MCP tools (Chrome DevTools) are excluded from the orchestrator to avoid
context bloat from screenshots. They're available to `browser-credential-setup`


@@ -209,9 +209,9 @@ are configured.
---
## Workflow Tools (8–12)
## Workflow Tools (9–13)
Core count is 8; up to 4 more are conditionally registered based on license.
Core count is 9; up to 4 more are conditionally registered based on license.
### `list-workflows`
@@ -221,8 +221,11 @@ List workflows accessible to the current user.
|-------|------|----------|---------|-------------|
| `query` | string | no | — | Filter workflows by name |
| `limit` | number | no | 50 | Max results (1–100) |
| `status` | `"active" \| "archived" \| "all"` | no | `"active"` | Which workflows to list |
**Returns**: `{ workflows: [{ id, name, active, createdAt, updatedAt }] }`
**Returns**: `{ workflows: [{ id, name, activeVersionId, isArchived, createdAt, updatedAt }] }`
`activeVersionId` is `null` when the workflow is unpublished.
### `get-workflow`
@@ -232,7 +235,9 @@ Get full workflow definition including nodes, connections, and settings.
|-------|------|----------|-------------|
| `workflowId` | string | yes | Workflow ID |
**Returns**: `{ id, name, active, nodes, connections, settings }`
**Returns**: `{ id, name, activeVersionId, isArchived, nodes, connections, settings }`
`activeVersionId` is `null` when the workflow is unpublished.
### `get-workflow-as-code`
@@ -263,7 +268,8 @@ workflow JSON, applies layout engine positioning, resolves credentials.
### `delete-workflow`
Archive a workflow (soft delete, deactivates if needed).
Archive a workflow (soft delete, deactivates if needed). This is reversible
with `unarchive-workflow`.
| Field | Type | Required | Description |
|-------|------|----------|-------------|
@@ -271,6 +277,16 @@ Archive a workflow (soft delete, deactivates if needed).
**Returns**: `{ success: boolean }`
### `unarchive-workflow`
Restore an archived workflow without publishing it.
| Field | Type | Required | Description |
|-------|------|----------|-------------|
| `workflowId` | string | yes | Archived workflow to restore |
**Returns**: `{ success: boolean }`
### `setup-workflow`
Open the UI for per-node credential and parameter setup. Uses a suspend/resume


@@ -121,7 +121,7 @@ dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai --iterations 3
| `--base-url` | `http://localhost:5678` | n8n instance URL |
| `--email` | E2E test owner | Override login email (or `N8N_EVAL_EMAIL`) |
| `--password` | E2E test owner | Override login password (or `N8N_EVAL_PASSWORD`) |
| `--timeout-ms` | `600000` | Per-test-case timeout |
| `--timeout-ms` | `900000` | Per-test-case timeout |
| `--output-dir` | cwd | Where to write `eval-results.json` |
| `--dataset` | `instance-ai-workflow-evals` | LangSmith dataset name |
| `--concurrency` | `16` | Max concurrent scenarios (builds are separately capped at 4) |
@@ -155,6 +155,47 @@ Every run produces:
**LangSmith caveat:** if `LANGSMITH_API_KEY` is set in `.env.local`, local runs also land in the shared `instance-ai-workflow-evals` dataset. Unset it (or run without `dotenvx`) to keep exploratory runs out of team results.
## Regression detection
When `LANGSMITH_API_KEY` is set, every eval run automatically compares its results against the most recent pinned baseline (any experiment whose name starts with `instance-ai-baseline-`). Two output files are written:
- `eval-results.json` — structured data only, including `comparison.result` when a baseline was found.
- `eval-pr-comment.md` — the full PR comment rendered as markdown, including the alert, aggregate, comparison sections, per-test-case results, and failure details. Always written; falls back to a no-baseline summary when no comparison ran.
The CI PR-comment step uses `eval-pr-comment.md` as the entire comment body (no jq assembly in the workflow). The console output uses a separate aligned-text formatter — same data, no markdown noise in the terminal.
### Refreshing the baseline
There is no auto-refresh — refresh explicitly when you want a new reference point, ideally with high N for low noise:
```bash
# From packages/@n8n/instance-ai/, on master at the version you want to pin
LANGSMITH_API_KEY=... dotenvx run -f ../../../.env.local -- \
pnpm eval:instance-ai --experiment-name instance-ai-baseline --iterations 10
```
LangSmith appends a random suffix (e.g. `instance-ai-baseline-7abc1234`); the most recently started one becomes the comparison target on the next eval run. The comparison is silently skipped on the baseline-creation run itself.
### How scenarios are tiered
Each scenario lands in one of three regression tiers, evaluated in order of strictness:
- **Regression** — high-confidence flag, gating-grade. The drop must be statistically significant (chance of seeing it by noise < 5%), at least 30 percentage points in size, and the baseline must have been reliable (≥ 70% pass rate).
- **Soft regression** — a looser bar for visibility on borderline cases: relaxed confidence threshold (chance by noise < 20%), drop ≥ 15 percentage points, baseline ≥ 50% pass rate. Frequently natural variance; worth a glance only if your changes touch related code paths.
- **Notable movement** — any scenario whose pass rate moved by ≥ 35 percentage points without reaching either flag tier. Pure visibility, no implication of cause.
Other verdicts: `improvement` (PR significantly better, skips the reliability gate), `unreliable_baseline` (confident drop but baseline was too flaky to call a regression — surfaced but not flagged), `stable`, `insufficient_data`.
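Under the thresholds quoted above, and assuming `p` comes from a one-sided Fisher's exact test, the tier decision can be sketched as follows. This is illustrative only: it omits the `improvement`, `unreliable_baseline`, and `insufficient_data` branches that the real `classifyScenario` in `comparison/statistics.ts` handles.

```typescript
type Verdict = 'hard_regression' | 'soft_regression' | 'watch' | 'stable';

// delta = PR pass rate minus baseline pass rate, as a fraction in [-1, 1];
// p = one-sided Fisher's exact p-value for "PR is worse than baseline".
function tierVerdict(delta: number, p: number, baselinePassRate: number): Verdict {
	// Hard tier: significant (p < 0.05), ≥ 30pp drop, reliable baseline (≥ 70%).
	if (p < 0.05 && delta <= -0.3 && baselinePassRate >= 0.7) return 'hard_regression';
	// Soft tier: looser bar (p < 0.20), ≥ 15pp drop, baseline ≥ 50%.
	if (p < 0.2 && delta <= -0.15 && baselinePassRate >= 0.5) return 'soft_regression';
	// Notable movement: ≥ 35pp shift in either direction without reaching a flag tier.
	if (Math.abs(delta) >= 0.35) return 'watch';
	return 'stable';
}
```

The thresholds are expressed as pass-rate fractions, matching the percentage-point figures above.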
Why these tiers and not a flat percentage threshold? At the small N that PR runs use (typically 3 iterations), a flat threshold can't tell a real regression from coin-flip noise. The confidence cutoff filters out gaps that could plausibly happen by chance, and the reliability gate avoids chasing noise on already-flaky scenarios. Implementation lives in `comparison/statistics.ts` (Fisher's exact test for the confidence check, Wilson interval for the headline aggregate band). Tune the soft tier first if the false-positive rate looks off — keep the hard tier strict.
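For reference, both statistics are small enough to sketch inline. The following is a hypothetical re-implementation, not the shipped code; the real `fishersExactOneSidedLeft` and `wilsonInterval` live in `comparison/statistics.ts` and may differ in signature and edge-case handling.

```typescript
// Log-space factorials keep the hypergeometric terms from overflowing.
function lnFact(n: number): number {
	let s = 0;
	for (let i = 2; i <= n; i++) s += Math.log(i);
	return s;
}

function lnChoose(n: number, k: number): number {
	if (k < 0 || k > n) return -Infinity; // impossible draw contributes zero mass
	return lnFact(n) - lnFact(k) - lnFact(n - k);
}

// One-sided left-tail Fisher's exact test: under "no real difference", the
// probability of a PR pass count this low or lower.
function fisherLeftTail(prPass: number, prFail: number, basePass: number, baseFail: number): number {
	const n1 = prPass + prFail;
	const n2 = basePass + baseFail;
	if (n1 === 0 || n2 === 0) return 1; // an empty row carries no information
	const passes = prPass + basePass;
	const total = n1 + n2;
	let p = 0;
	for (let x = 0; x <= prPass; x++) {
		p += Math.exp(lnChoose(passes, x) + lnChoose(total - passes, n1 - x) - lnChoose(total, n1));
	}
	return Math.min(p, 1);
}

// Wilson 95% score interval for a pass rate; used for the aggregate band.
function wilson(passes: number, total: number, z = 1.96): { lower: number; upper: number } {
	if (total === 0) return { lower: 0, upper: 1 };
	const p = passes / total;
	const z2 = z * z;
	const denom = 1 + z2 / total;
	const center = p + z2 / (2 * total);
	const margin = z * Math.sqrt((p * (1 - p)) / total + z2 / (4 * total * total));
	return { lower: (center - margin) / denom, upper: (center + margin) / denom };
}
```

On the textbook case used in the tests (PR 1/3 vs baseline 10/10), this yields p ≈ 0.0385, and `wilson(5, 10)` gives roughly [0.237, 0.763].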
### Failure-category drift
When both sides captured per-trial `failureCategory` values, the comparison also surfaces a run-level table of category rates (PR vs baseline). A category is marked **notable** when its absolute rate delta is ≥ 5 percentage points _and_ the count change beyond what scenario-count scaling would predict is ≥ 3 trials. This catches cross-scenario shifts (e.g. mock-generation breaking, or a model getting weaker overall) that per-scenario flags can miss.
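A minimal sketch of that notability rule, with illustrative field names (the shipped check lives alongside `compareBuckets` in `comparison/compare.ts`):

```typescript
interface CategorySide {
	count: number; // trials that failed with this category
	trialTotal: number; // total trials on that side
}

function isNotable(pr: CategorySide, base: CategorySide): boolean {
	// Bar 1: absolute rate delta of at least 5 percentage points.
	const rateGap = Math.abs(pr.count / pr.trialTotal - base.count / base.trialTotal);
	// Bar 2: the count must move by at least 3 trials beyond what simply
	// scaling the baseline count to the PR's trial volume would predict.
	const expectedPrCount = base.count * (pr.trialTotal / base.trialTotal);
	const countGap = Math.abs(pr.count - expectedPrCount);
	return rateGap >= 0.05 && countGap >= 3;
}
```

The count bar is what guards against small-N false positives: a single extra failure in a 3-trial run can swing the rate by 33pp but only moves the count by 1.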
### Best-effort
Any LangSmith failure during comparison is logged and the comparison skipped — it never fails the eval. The comparison is also skipped when no baseline experiment exists yet.
## Pairwise evals
Pairwise evals score a built workflow against the dataset's `dos` / `donts`

View File

@ -0,0 +1,190 @@
import { compareBuckets, type ExperimentBucket, type ScenarioCounts } from '../comparison/compare';
function bucket(
name: string,
scenarios: ScenarioCounts[],
categories?: { totals: Record<string, number>; trialTotal: number },
): ExperimentBucket {
return {
experimentName: name,
scenarios: new Map(scenarios.map((s) => [`${s.testCaseFile}/${s.scenarioName}`, s])),
failureCategoryTotals: categories?.totals,
trialTotal: categories?.trialTotal,
};
}
function s(file: string, scenario: string, passed: number, total: number): ScenarioCounts {
return { testCaseFile: file, scenarioName: scenario, passed, total };
}
describe('compareBuckets', () => {
it('produces a clean intersection when both sides have the same scenarios', () => {
const pr = bucket('pr', [s('contact', 'happy', 8, 10), s('weather', 'happy', 1, 10)]);
const base = bucket('master', [s('contact', 'happy', 9, 10), s('weather', 'happy', 0, 10)]);
const result = compareBuckets(pr, base);
expect(result.scenarios).toHaveLength(2);
expect(result.prOnly).toEqual([]);
expect(result.baselineOnly).toEqual([]);
expect(result.aggregate.intersectionSize).toBe(2);
});
it('flags scenarios only present on one side', () => {
const pr = bucket('pr', [s('contact', 'happy', 5, 10)]);
const base = bucket('master', [s('contact', 'happy', 8, 10), s('weather', 'happy', 5, 10)]);
const result = compareBuckets(pr, base);
expect(result.scenarios).toHaveLength(1);
expect(result.scenarios[0].testCaseFile).toBe('contact');
expect(result.baselineOnly).toEqual([{ testCaseFile: 'weather', scenarioName: 'happy' }]);
expect(result.prOnly).toEqual([]);
});
it('aggregates only over the intersection, not over baseline-only or pr-only', () => {
const pr = bucket('pr', [s('contact', 'happy', 10, 10)]);
const base = bucket('master', [s('contact', 'happy', 5, 10), s('other', 'happy', 0, 10)]);
const result = compareBuckets(pr, base);
expect(result.aggregate.prAggregatePassRate).toBe(1);
expect(result.aggregate.baselineAggregatePassRate).toBe(0.5);
expect(result.aggregate.intersectionSize).toBe(1);
});
it('sorts scenarios with regressions first, then improvements, then stable', () => {
const pr = bucket('pr', [
s('a', 'stable', 10, 10),
s('b', 'regression', 0, 10),
s('c', 'improvement', 10, 10),
]);
const base = bucket('master', [
s('a', 'stable', 10, 10),
s('b', 'regression', 10, 10),
s('c', 'improvement', 0, 10),
]);
const result = compareBuckets(pr, base);
expect(result.scenarios.map((sc) => sc.scenarioName)).toEqual([
'regression',
'improvement',
'stable',
]);
});
it('returns insufficient_data when one side has zero trials for a scenario', () => {
const pr = bucket('pr', [s('contact', 'happy', 0, 0)]);
const base = bucket('master', [s('contact', 'happy', 10, 10)]);
const result = compareBuckets(pr, base);
expect(result.scenarios[0].verdict).toBe('insufficient_data');
});
it('returns no failure-category drift when either side lacks category totals', () => {
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10)]);
expect(compareBuckets(pr, base).failureCategories).toEqual([]);
});
it('flags a category as notable when both rate and trial-count gaps clear the bars', () => {
// Haiku-style shift: framework_issue 0/290 → 9/145.
	// Rate gap: 6.2pp ≥ 5pp ✓. Expected PR count given baseline = 0 × (145/290) = 0; |9 − 0| = 9 ≥ 3 ✓.
const pr = bucket('pr', [s('a', 'happy', 50, 145)], {
totals: { framework_issue: 9 },
trialTotal: 145,
});
const base = bucket('master', [s('a', 'happy', 200, 290)], {
totals: { framework_issue: 0 },
trialTotal: 290,
});
const cats = compareBuckets(pr, base).failureCategories;
const fw = cats.find((c) => c.category === 'framework_issue');
expect(fw?.notable).toBe(true);
});
it('does not flag when the rate gap is below the 5pp bar', () => {
// 3/100 vs 2/100 = 1pp gap, count gap = 1 — neither bar cleared.
const pr = bucket('pr', [s('a', 'happy', 50, 100)], {
totals: { mock_issue: 3 },
trialTotal: 100,
});
const base = bucket('master', [s('a', 'happy', 50, 100)], {
totals: { mock_issue: 2 },
trialTotal: 100,
});
const cats = compareBuckets(pr, base).failureCategories;
expect(cats.find((c) => c.category === 'mock_issue')?.notable).toBe(false);
});
it('does not flag when the rate gap is large but the count gap is tiny (small N guard)', () => {
// PR 1/3 vs baseline 0/270 — rate gap = 33pp ≥ 5pp, but expected PR count = 0
// and observed = 1, count gap = 1 < 3. Should NOT flag — single trial on small N.
const pr = bucket('pr', [s('a', 'happy', 0, 3)], {
totals: { builder_issue: 1 },
trialTotal: 3,
});
const base = bucket('master', [s('a', 'happy', 270, 270)], {
totals: { builder_issue: 0 },
trialTotal: 270,
});
const cats = compareBuckets(pr, base).failureCategories;
expect(cats.find((c) => c.category === 'builder_issue')?.notable).toBe(false);
});
it('drops unknown categories with a console warning, keeps all known categories', () => {
const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
const pr = bucket('pr', [s('a', 'happy', 8, 10)], {
totals: { '-': 5, builder_issue: 2 },
trialTotal: 10,
});
const base = bucket('master', [s('a', 'happy', 8, 10)], {
totals: { builder_issue: 1 },
trialTotal: 10,
});
const cats = compareBuckets(pr, base).failureCategories;
// All five known categories are always present (some at 0/0 — renderer
// drops those). The unknown `-` category is dropped here with a warning.
expect(cats.map((c) => c.category).sort()).toEqual([
'build_failure',
'builder_issue',
'framework_issue',
'mock_issue',
'verification_failure',
]);
expect(warn).toHaveBeenCalledWith(expect.stringContaining('"-"'));
warn.mockRestore();
});
it('sorts notable categories before non-notable, then by absolute delta', () => {
const pr = bucket('pr', [s('a', 'happy', 50, 100)], {
totals: { framework_issue: 10, mock_issue: 4, builder_issue: 25 },
trialTotal: 100,
});
const base = bucket('master', [s('a', 'happy', 50, 100)], {
totals: { framework_issue: 0, mock_issue: 3, builder_issue: 22 },
trialTotal: 100,
});
const cats = compareBuckets(pr, base).failureCategories;
// framework_issue is the only notable one (rate gap 10pp, count gap 10).
expect(cats[0].category).toBe('framework_issue');
expect(cats[0].notable).toBe(true);
expect(cats.slice(1).every((c) => !c.notable)).toBe(true);
});
it('accepts custom tiered thresholds for tests', () => {
const pr = bucket('pr', [s('a', 'happy', 5, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10)]);
// Defaults: 5/10 vs 8/10 = -30pp drop, p ≈ 0.18 → soft_regression
// (passes soft maxPValue=0.20, soft minDelta=0.15, baseline 80% above soft 50%).
const defaults = compareBuckets(pr, base);
expect(defaults.scenarios[0].verdict).toBe('soft_regression');
// Stricter soft p-value cutoff excludes this case.
const stricter = compareBuckets(pr, base, {
soft: { maxPValue: 0.1, minDelta: 0.15, minBaselinePassRate: 0.5 },
});
expect(['stable', 'watch']).toContain(stricter.scenarios[0].verdict);
});
});

View File

@ -0,0 +1,458 @@
import {
compareBuckets,
type ComparisonOutcome,
type ComparisonResult,
type ExperimentBucket,
type ScenarioCounts,
} from '../comparison/compare';
import { formatComparisonMarkdown, formatComparisonTerminal } from '../comparison/format';
import type { MultiRunEvaluation, WorkflowTestCase, ScenarioResult } from '../types';
function ok(result: ComparisonResult): ComparisonOutcome {
return { kind: 'ok', result };
}
function slugMap(evaluation: MultiRunEvaluation, slugs: string[]): Map<WorkflowTestCase, string> {
return new Map(evaluation.testCases.map((tc, i) => [tc.testCase, slugs[i] ?? 'unknown']));
}
function bucket(name: string, scenarios: ScenarioCounts[]): ExperimentBucket {
return {
experimentName: name,
scenarios: new Map(scenarios.map((s) => [`${s.testCaseFile}/${s.scenarioName}`, s])),
};
}
function s(file: string, scenario: string, passed: number, total: number): ScenarioCounts {
return { testCaseFile: file, scenarioName: scenario, passed, total };
}
/** Minimal evaluation fixture matching the shape format.ts reads. */
function evaluation(
opts: {
totalRuns?: number;
testCases?: Array<{
prompt?: string;
buildSuccessCount?: number;
scenarios?: Array<{
name: string;
passCount: number;
passes: boolean[]; // per-iteration pass/fail
reasoning?: string;
failureCategory?: string;
}>;
}>;
} = {},
): MultiRunEvaluation {
const totalRuns = opts.totalRuns ?? 3;
return {
totalRuns,
testCases: (opts.testCases ?? []).map((tc) => {
const testCase = {
prompt: tc.prompt ?? 'Test workflow prompt',
complexity: 'medium' as const,
tags: [],
scenarios: (tc.scenarios ?? []).map((sa) => ({
name: sa.name,
description: '',
dataSetup: '',
successCriteria: '',
})),
} as WorkflowTestCase;
const buildSuccessCount = tc.buildSuccessCount ?? totalRuns;
const scenarios = (tc.scenarios ?? []).map((sa) => ({
scenario: testCase.scenarios.find((sc) => sc.name === sa.name)!,
passCount: sa.passCount,
passRate: totalRuns > 0 ? sa.passCount / totalRuns : 0,
passAtK: new Array(totalRuns).fill(sa.passCount > 0 ? 1 : 0) as number[],
passHatK: new Array(totalRuns).fill(sa.passCount === totalRuns ? 1 : 0) as number[],
runs: sa.passes.map(
(passed): ScenarioResult => ({
scenario: testCase.scenarios.find((sc) => sc.name === sa.name)!,
success: passed,
score: passed ? 1 : 0,
reasoning: sa.reasoning ?? '',
failureCategory: !passed ? sa.failureCategory : undefined,
}),
),
}));
return {
testCase,
workflowBuildSuccess: buildSuccessCount > 0,
scenarioResults: [],
scenarios,
runs: new Array(totalRuns).fill(null).map(() => ({
testCase,
workflowBuildSuccess: buildSuccessCount > 0,
scenarioResults: [],
})),
buildSuccessCount,
};
}),
};
}
describe('formatComparisonMarkdown', () => {
const evalFixture = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'a',
scenarios: [{ name: 'happy', passCount: 0, passes: [false, false, false] }],
},
],
});
it('renders heading, alert, aggregate, and a regression table', () => {
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
const base = bucket('master-abc', [s('a', 'happy', 10, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/### Instance AI Workflow Eval/);
expect(md).toMatch(/> \[!CAUTION\]/);
expect(md).toMatch(/1 regression/);
expect(md).toMatch(/\*\*Aggregate\*\*: 0\.0% PR vs 100\.0% baseline/);
expect(md).toMatch(/#### Regressions \(1\)/);
expect(md).toMatch(/`a\/happy`/);
expect(md).toMatch(/0\/3 \(0%\)/);
expect(md).toMatch(/-100pp ↓/);
});
it('uses TIP alert when there are only improvements', () => {
const pr = bucket('pr', [s('a', 'happy', 3, 3)]);
const base = bucket('master', [s('a', 'happy', 0, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/> \[!TIP\]/);
expect(md).toMatch(/1 improvement/);
expect(md).toMatch(/#### Improvements \(1\)/);
expect(md).toMatch(/\+100pp ↑/);
});
it('uses TIP alert with "0 regressions" when everything is stable', () => {
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/> \[!TIP\]/);
expect(md).toMatch(/0 regressions/);
expect(md).toMatch(/1 stable/);
expect(md).not.toMatch(/#### Regressions/);
});
it('renders LangSmith-disabled NOTE when outcome is undefined', () => {
const md = formatComparisonMarkdown(evalFixture);
expect(md).toMatch(/> \[!NOTE\]/);
expect(md).toMatch(/LangSmith disabled/);
expect(md).not.toMatch(/#### Regressions/);
});
it('renders distinct alerts per skip reason', () => {
const noBase = formatComparisonMarkdown(evalFixture, { kind: 'no_baseline' });
expect(noBase).toMatch(/> \[!NOTE\]/);
expect(noBase).toMatch(/No baseline configured/);
const selfBase = formatComparisonMarkdown(evalFixture, {
kind: 'self_baseline',
experimentName: 'instance-ai-baseline-abc',
});
expect(selfBase).toMatch(/> \[!NOTE\]/);
expect(selfBase).toMatch(/This run is the baseline/);
expect(selfBase).toMatch(/instance-ai-baseline-abc/);
const fetchFail = formatComparisonMarkdown(evalFixture, {
kind: 'fetch_failed',
error: 'LangSmith 503',
});
// fetch_failed is a real outage, not a benign skip — must be a WARNING.
expect(fetchFail).toMatch(/> \[!WARNING\]/);
expect(fetchFail).toMatch(/Regression detection did not run/);
expect(fetchFail).toMatch(/LangSmith 503/);
});
it('shows mixed-case alert when both regressions and improvements exist', () => {
const pr = bucket('pr', [s('a', 'happy', 0, 3), s('b', 'happy', 3, 3)]);
const base = bucket('master', [s('a', 'happy', 10, 10), s('b', 'happy', 0, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/> \[!CAUTION\]/);
expect(md).toMatch(/1 regression/);
expect(md).toMatch(/1 improvement/);
expect(md).toMatch(/#### Regressions/);
expect(md).toMatch(/#### Improvements/);
});
it('embeds commit SHA in heading when provided', () => {
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)), {
commitSha: 'abc1234567890def',
});
expect(md).toMatch(/### Instance AI Workflow Eval — `abc12345`/);
});
it('marks new failure categories with 🆕', () => {
const pr: ExperimentBucket = {
experimentName: 'pr',
scenarios: new Map([['a/happy', { ...s('a', 'happy', 0, 3) }]]),
failureCategoryTotals: { framework_issue: 9 },
trialTotal: 145,
};
const base: ExperimentBucket = {
experimentName: 'master',
scenarios: new Map([['a/happy', { ...s('a', 'happy', 5, 10) }]]),
failureCategoryTotals: { framework_issue: 0 },
trialTotal: 290,
};
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/#### Failure breakdown/);
expect(md).toMatch(/`framework_issue` 🆕/);
expect(md).toMatch(/\*\*notable\*\*/);
});
it('always includes all five tier counts in the alert line', () => {
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10)]);
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/0 regressions, 0 soft, 0 notable, 0 improvements, 1 stable/);
});
it('renders a per-scenario breakdown collapsible inside the regression section', () => {
const evalWithFailures = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'a',
scenarios: [
{
name: 'happy',
passCount: 0,
passes: [false, false, false],
reasoning: 'Builder produced an unsupported node configuration',
failureCategory: 'builder_issue',
},
],
},
],
});
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
const base = bucket('master', [s('a', 'happy', 10, 10)]);
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)), {
slugByTestCase: slugMap(evalWithFailures, ['a']),
});
expect(md).toMatch(/#### Regressions \(1\)/);
// The regression row's collapsible should appear inside the Regressions
// section, before the per-test-case section, and carry the same slug.
const regressionsIdx = md.indexOf('#### Regressions');
const perTcIdx = md.indexOf('Per-test-case results');
const breakdownIdx = md.indexOf('<code>a/happy</code>');
expect(breakdownIdx).toBeGreaterThan(regressionsIdx);
expect(breakdownIdx).toBeLessThan(perTcIdx);
expect(md).toMatch(/3 of 3 failed · 3× builder_issue/);
expect(md).toMatch(/Run 1 \[builder_issue\]: Builder produced/);
});
it('uses `file/scenario` slug headers in the bottom Failure details section', () => {
const evalWithFailures = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'Build a cross-team Linear report digest',
scenarios: [
{
name: 'no-cross-team-issues',
passCount: 0,
passes: [false, false, false],
reasoning: 'reason',
failureCategory: 'builder_issue',
},
],
},
],
});
const pr = bucket('pr', [s('cross-team-linear-report', 'no-cross-team-issues', 0, 3)]);
const base = bucket('master', [s('cross-team-linear-report', 'no-cross-team-issues', 10, 10)]);
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)), {
slugByTestCase: slugMap(evalWithFailures, ['cross-team-linear-report']),
});
expect(md).toMatch(/<summary>Failure details<\/summary>/);
expect(md).toMatch(/\*\*`cross-team-linear-report\/no-cross-team-issues`\*\* — 3 failed/);
});
it('attaches per-scenario failures to the right file slug when names collide', () => {
// Two test cases each defining `happy-path`. Without the slug map,
// the renderer would conflate them — Albert's review flagged this
// exact bug. With the map, each row's collapsible carries only that
// row's failures.
const evalWithFailures = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'cross-team prompt',
scenarios: [
{
name: 'happy-path',
passCount: 0,
passes: [false, false, false],
reasoning: 'Linear node misconfigured',
failureCategory: 'builder_issue',
},
],
},
{
prompt: 'weather prompt',
scenarios: [
{
name: 'happy-path',
passCount: 0,
passes: [false, false, false],
reasoning: 'Weather mock returned empty',
failureCategory: 'mock_issue',
},
],
},
],
});
const pr = bucket('pr', [
s('cross-team-linear-report', 'happy-path', 0, 3),
s('weather-monitoring', 'happy-path', 0, 3),
]);
const base = bucket('master', [
s('cross-team-linear-report', 'happy-path', 10, 10),
s('weather-monitoring', 'happy-path', 10, 10),
]);
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)), {
slugByTestCase: slugMap(evalWithFailures, ['cross-team-linear-report', 'weather-monitoring']),
});
// Each per-scenario collapsible (under the regression table) must show
// ONLY its own failures. Slice each block at its closing </details>.
function collapsibleFor(slug: string): string {
const open = md.indexOf(`<code>${slug}</code>`);
expect(open).toBeGreaterThan(-1);
const close = md.indexOf('</details>', open);
return md.slice(open, close);
}
const crossTeamBlock = collapsibleFor('cross-team-linear-report/happy-path');
const weatherBlock = collapsibleFor('weather-monitoring/happy-path');
expect(crossTeamBlock).toMatch(/Linear node misconfigured/);
expect(crossTeamBlock).not.toMatch(/Weather mock returned empty/);
expect(weatherBlock).toMatch(/Weather mock returned empty/);
expect(weatherBlock).not.toMatch(/Linear node misconfigured/);
});
it('uses the slug instead of the prompt in the per-test-case table', () => {
const evalFx = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'Build a cross-team Linear report digest from open issues',
scenarios: [{ name: 'happy', passCount: 0, passes: [false, false, false] }],
},
],
});
const pr = bucket('pr', [s('cross-team-linear-report', 'happy', 0, 3)]);
const base = bucket('master', [s('cross-team-linear-report', 'happy', 10, 10)]);
const md = formatComparisonMarkdown(evalFx, ok(compareBuckets(pr, base)), {
slugByTestCase: slugMap(evalFx, ['cross-team-linear-report']),
});
// Per-test-case table cell should be the slug, not the prompt.
const perTcSection = md.slice(md.indexOf('Per-test-case results'));
expect(perTcSection).toMatch(/`cross-team-linear-report`/);
expect(perTcSection).not.toMatch(/Build a cross-team Linear report digest/);
});
it('skips per-scenario breakdown when slugByTestCase is omitted', () => {
// Without the slug map, the renderer can't disambiguate. We'd rather
// drop the breakdown than show a wrong one.
const evalWithFailures = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'a',
scenarios: [
{
name: 'happy',
passCount: 0,
passes: [false, false, false],
reasoning: 'Some failure',
failureCategory: 'builder_issue',
},
],
},
],
});
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
const base = bucket('master', [s('a', 'happy', 10, 10)]);
const md = formatComparisonMarkdown(evalWithFailures, ok(compareBuckets(pr, base)));
// Regression table still rendered.
expect(md).toMatch(/#### Regressions \(1\)/);
// But no per-scenario collapsible (which would have used <code>a/happy</code>
// with the breakdown summary text).
expect(md).not.toMatch(/3 of 3 failed · 3× builder_issue/);
});
it('renders the failure breakdown for non-notable categories with non-zero counts', () => {
// 50/100 vs 50/100 — no scenario regression, but still has builder_issue
// counts on both sides (non-notable but non-zero).
const pr: ExperimentBucket = {
experimentName: 'pr',
scenarios: new Map([['a/happy', { ...s('a', 'happy', 50, 100) }]]),
failureCategoryTotals: { builder_issue: 25 },
trialTotal: 100,
};
const base: ExperimentBucket = {
experimentName: 'master',
scenarios: new Map([['a/happy', { ...s('a', 'happy', 50, 100) }]]),
failureCategoryTotals: { builder_issue: 22 },
trialTotal: 100,
};
const md = formatComparisonMarkdown(evalFixture, ok(compareBuckets(pr, base)));
expect(md).toMatch(/#### Failure breakdown/);
expect(md).toMatch(/`builder_issue`/);
// builder_issue isn't notable here, so no "notable" marker.
expect(md).not.toMatch(/builder_issue.*notable/);
});
});
describe('formatComparisonTerminal', () => {
const evalFixture = evaluation({
totalRuns: 3,
testCases: [
{
prompt: 'a',
scenarios: [{ name: 'happy', passCount: 0, passes: [false, false, false] }],
},
],
});
it('renders title, verdict, aggregate, and regression table without markdown syntax', () => {
const pr = bucket('pr', [s('a', 'happy', 0, 3)]);
const base = bucket('master-abc', [s('a', 'happy', 10, 10)]);
const out = formatComparisonTerminal(evalFixture, ok(compareBuckets(pr, base)));
expect(out).toMatch(/^Instance AI Workflow Eval/);
expect(out).toMatch(/▶ 1 regression/);
expect(out).toMatch(/PR\s{8}0\.0%/);
expect(out).toMatch(/baseline\s{2}100\.0%/);
expect(out).toMatch(/REGRESSIONS/);
expect(out).toMatch(/a\/happy/);
expect(out).not.toMatch(/^###/m);
expect(out).not.toMatch(/\| /);
});
it('renders LangSmith-disabled message when outcome is undefined', () => {
const out = formatComparisonTerminal(evalFixture);
expect(out).toMatch(/LangSmith disabled/);
expect(out).not.toMatch(/REGRESSIONS/);
});
it('shows partial banner when scenarios differ on each side', () => {
const pr = bucket('pr', [s('a', 'happy', 8, 10)]);
const base = bucket('master', [s('a', 'happy', 8, 10), s('b', 'happy', 5, 10)]);
const out = formatComparisonTerminal(evalFixture, ok(compareBuckets(pr, base)));
expect(out).toMatch(/partial: 1 baseline scenarios not run by PR/);
});
});

View File

@ -0,0 +1,161 @@
import {
classifyScenario,
fishersExactOneSidedLeft,
wilsonInterval,
} from '../comparison/statistics';
describe('fishersExactOneSidedLeft', () => {
it('returns 1 when either row is empty (no information)', () => {
expect(fishersExactOneSidedLeft(0, 0, 5, 5)).toBe(1);
expect(fishersExactOneSidedLeft(5, 5, 0, 0)).toBe(1);
});
it('returns 1 when no failures or no passes are observed (no test possible)', () => {
expect(fishersExactOneSidedLeft(3, 0, 5, 0)).toBe(1);
expect(fishersExactOneSidedLeft(0, 3, 0, 5)).toBe(1);
});
it('matches a known textbook case', () => {
// 2x2 table where PR (1/3) is much worse than baseline (10/10).
// Hypergeometric: P(X = 0) + P(X = 1) | drawn=3 from passes=11, fails=2
// = C(11,0)C(2,3)/C(13,3) + C(11,1)C(2,2)/C(13,3)
// = 0 + 11/286 ≈ 0.03846
const p = fishersExactOneSidedLeft(1, 2, 10, 0);
expect(p).toBeCloseTo(0.03846, 4);
});
it('returns p = 1 when PR pass rate equals baseline at maximum', () => {
// PR all pass, baseline all pass — under H0 the observed PR is the most likely outcome,
// so the left-tail (X ≤ a) p-value is exactly 1.
const p = fishersExactOneSidedLeft(5, 0, 5, 0);
expect(p).toBe(1);
});
it('detects a strong regression with high N', () => {
// PR 0/10, baseline 10/10 — extremely strong evidence PR is worse.
const p = fishersExactOneSidedLeft(0, 10, 10, 0);
expect(p).toBeLessThan(0.001);
});
it('returns 1 when PR matches baseline rates exactly', () => {
// PR 5/10, baseline 5/10 — left tail at the median is around 0.5 + symmetric mass
// at the observed value, but should be > 0.5 (we're at the center of the distribution).
const p = fishersExactOneSidedLeft(5, 5, 5, 5);
expect(p).toBeGreaterThan(0.5);
});
});
describe('wilsonInterval', () => {
it('returns [0, 1] for total=0', () => {
expect(wilsonInterval(0, 0)).toEqual({ lower: 0, upper: 1 });
});
it('produces reasonable bounds for 5/10', () => {
const ci = wilsonInterval(5, 10);
// Known Wilson 95% CI for 5/10: roughly [0.237, 0.763]
expect(ci.lower).toBeCloseTo(0.237, 2);
expect(ci.upper).toBeCloseTo(0.763, 2);
});
it('produces tight bounds for 0/100', () => {
const ci = wilsonInterval(0, 100);
expect(ci.lower).toBe(0);
expect(ci.upper).toBeLessThan(0.05);
});
it('produces tight bounds for 100/100', () => {
const ci = wilsonInterval(100, 100);
// upper analytically equals 1 but lands slightly under it after FP rounding —
// any reasonable CI for 100/100 should still be tight to the top of the range.
expect(ci.upper).toBeGreaterThanOrEqual(0.99);
expect(ci.lower).toBeGreaterThan(0.95);
});
it('throws when passes > total', () => {
expect(() => wilsonInterval(5, 3)).toThrow();
});
});
describe('classifyScenario', () => {
it('flags a clear regression on a reliable scenario as hard_regression', () => {
const result = classifyScenario(0, 10, 10, 10);
expect(result.verdict).toBe('hard_regression');
expect(result.delta).toBe(-1);
});
it('marks a hard-significant drop on an unreliable baseline as unreliable_baseline', () => {
// Baseline 4/10 (40%) — below hard reliable (70%). PR 0/10 is a 40pp drop with
// Fisher p < 0.05. We surface it as `unreliable_baseline` rather than flagging.
const result = classifyScenario(0, 10, 4, 10);
expect(result.verdict).toBe('unreliable_baseline');
});
it('reports stable when the drop is sub-MDE on a flaky baseline', () => {
// Baseline 1/10 (flaky), PR 0/10 — only a 10pp drop, below MDE.
const result = classifyScenario(0, 10, 1, 10);
expect(result.verdict).toBe('stable');
});
it('does not flag a small drop below the soft MDE threshold', () => {
// 9/10 vs 10/10 = 10pp drop, below soft MDE (15pp).
const result = classifyScenario(9, 10, 10, 10);
expect(result.verdict).toBe('stable');
});
it('flags an improvement when PR is significantly better', () => {
const result = classifyScenario(10, 10, 0, 10);
expect(result.verdict).toBe('improvement');
});
it('flags improvement even on a never-passing baseline', () => {
// "Never passes" baseline (0/10) — fix is worth surfacing without the reliability gate.
const result = classifyScenario(8, 10, 0, 10);
expect(result.verdict).toBe('improvement');
});
it('returns insufficient_data when either side has no trials', () => {
expect(classifyScenario(0, 0, 5, 10).verdict).toBe('insufficient_data');
expect(classifyScenario(5, 10, 0, 0).verdict).toBe('insufficient_data');
});
it('flags the most extreme outcome at minimum N as hard_regression', () => {
// PR 0/3 vs baseline 3/3 — Fisher one-sided p ≈ 0.05, delta = -100pp.
const result = classifyScenario(0, 3, 3, 3);
expect(result.verdict).toBe('hard_regression');
});
it('reports stable when N is small enough that even a full flip is sub-significant for soft tier', () => {
// PR 1/2 vs baseline 2/2 — delta -50pp but Fisher p ≈ 0.5 (way above soft α=0.20).
// Soft MDE met, but significance fails on both tiers.
const result = classifyScenario(1, 2, 2, 2);
expect(['stable', 'watch']).toContain(result.verdict);
});
it('marks soft regression when hard delta is missed but soft thresholds met', () => {
// 6/10 vs 10/10 = 40pp drop, p ≈ 0.043, baseline 100% reliable.
// Hard defaults would flag this; force a stricter hard delta to push it to soft.
const result = classifyScenario(6, 10, 10, 10, {
hard: { maxPValue: 0.05, minDelta: 0.5, minBaselinePassRate: 0.7 },
soft: { maxPValue: 0.2, minDelta: 0.15, minBaselinePassRate: 0.5 },
});
expect(result.verdict).toBe('soft_regression');
});
it('marks watch when delta crosses the watch threshold without significance', () => {
// 5/10 vs 7/10 = -20pp drop, p ≈ 0.32 — not significant for hard or soft.
// Default watchDelta is 0.35, so this should not be `watch`. Force it via
// a smaller threshold to validate the path.
const result = classifyScenario(5, 10, 7, 10, { watchDelta: 0.15 });
expect(result.verdict).toBe('watch');
});
it('respects custom hard-tier delta override', () => {
// 7/10 vs 10/10 = 30pp delta. Default hard minDelta is 0.3, so this barely qualifies.
// With hard.minDelta 0.4, it drops into `soft_regression` (still passes soft 0.15 minDelta).
// p ≈ 0.105 < soft maxPValue (0.2), so soft fires.
const result = classifyScenario(7, 10, 10, 10, {
hard: { minDelta: 0.4 },
});
expect(result.verdict).toBe('soft_regression');
});
});

View File

@ -45,7 +45,7 @@ export interface CliArgs {
// ---------------------------------------------------------------------------
const cliArgsSchema = z.object({
-	timeoutMs: z.number().int().positive().default(600_000),
+	timeoutMs: z.number().int().positive().default(900_000),
baseUrls: z.array(z.string().url()).min(1).default(['http://localhost:5678']),
email: z.string().optional(),
password: z.string().optional(),
@ -104,7 +104,7 @@ interface RawArgs {
function parseRawArgs(argv: string[]): RawArgs {
const result: RawArgs = {
-		timeoutMs: 600_000,
+		timeoutMs: 900_000,
baseUrls: ['http://localhost:5678'],
verbose: false,
keepWorkflows: false,

View File

@ -23,6 +23,15 @@ import { buildCIMetadata, computeExperimentPrefix } from './ci-metadata';
import { LaneAllocator } from './lane-allocator';
import { expandWithIterations, partitionRoundRobin } from './lanes';
import { N8nClient } from '../clients/n8n-client';
import {
compareBuckets,
type ComparisonOutcome,
type ComparisonResult,
type ExperimentBucket,
type ScenarioCounts,
} from '../comparison/compare';
import { fetchBaselineBucket, findLatestBaseline } from '../comparison/fetch-baseline';
import { formatComparisonMarkdown, formatComparisonTerminal } from '../comparison/format';
import { seedCredentials, cleanupCredentials } from '../credentials/seeder';
import { loadWorkflowTestCasesWithFiles } from '../data/workflows';
import type { WorkflowTestCaseWithFile } from '../data/workflows';
@ -43,6 +52,7 @@ import type {
MultiRunEvaluation,
ScenarioResult,
TestScenario,
WorkflowTestCase,
WorkflowTestCaseResult,
} from '../types';
@ -160,21 +170,40 @@ async function main(): Promise<void> {
const hasLangSmith = Boolean(process.env.LANGSMITH_API_KEY);
let evaluation: MultiRunEvaluation;
let experimentName: string | undefined;
let outcome: ComparisonOutcome | undefined;
let slugByTestCase: Map<WorkflowTestCase, string> | undefined;
if (hasLangSmith) {
logger.info('LangSmith API key detected, using evaluate() with experiment tracking');
evaluation = await runWithLangSmith({ args, lanes, logger });
const langsmithRun = await runWithLangSmith({ args, lanes, logger });
evaluation = langsmithRun.evaluation;
experimentName = langsmithRun.experimentName;
outcome = langsmithRun.outcome;
slugByTestCase = langsmithRun.slugByTestCase;
} else {
logger.info('No LANGSMITH_API_KEY, running direct loop (results in eval-results.json only)');
evaluation = await runDirectLoop({ args, lanes, logger });
}
const totalDuration = Date.now() - startTime;
const outputPath = writeEvalResults(evaluation, totalDuration, args.outputDir);
console.log(`Results: ${outputPath}`);
const commitSha = process.env.LANGSMITH_REVISION_ID ?? process.env.GITHUB_SHA;
const { jsonPath, prCommentPath } = writeEvalResults(
evaluation,
totalDuration,
args.outputDir,
experimentName,
outcome,
commitSha,
slugByTestCase,
);
console.log(`Results: ${jsonPath}`);
console.log(`PR comment: ${prCommentPath}`);
const htmlPath = writeWorkflowReport(flattenRunsForReport(evaluation));
console.log(`Report: ${htmlPath}`);
printSummary(evaluation);
console.log(`Report: ${htmlPath}`);
console.log(
'\n' + formatComparisonTerminal(evaluation, outcome, { commitSha, slugByTestCase }),
);
} finally {
await Promise.all(
lanes.map(async (lane) => {
@ -188,7 +217,12 @@ async function main(): Promise<void> {
// LangSmith mode: evaluate() with dataset sync, tracing, experiments
// ---------------------------------------------------------------------------
async function runWithLangSmith(config: RunConfig): Promise<MultiRunEvaluation> {
async function runWithLangSmith(config: RunConfig): Promise<{
evaluation: MultiRunEvaluation;
experimentName: string;
outcome: ComparisonOutcome;
slugByTestCase: Map<WorkflowTestCase, string>;
}> {
const { args, lanes, logger } = config;
const lsClient = new Client();
@ -466,7 +500,24 @@ async function runWithLangSmith(config: RunConfig): Promise<MultiRunEvaluation>
logger,
});
return evaluation;
const outcome = await tryRunComparison({
lsClient,
prExperimentName: experimentResults.experimentName,
evaluation,
testCasesWithFiles,
logger,
});
const slugByTestCase = new Map<WorkflowTestCase, string>(
testCasesWithFiles.map(({ testCase, fileSlug }) => [testCase, fileSlug]),
);
return {
evaluation,
experimentName: experimentResults.experimentName,
outcome,
slugByTestCase,
};
} finally {
if (!args.keepWorkflows) {
await Promise.all(
@ -711,39 +762,41 @@ async function runDirectLoop(config: RunConfig): Promise<MultiRunEvaluation> {
const indexed = testCasesWithFiles.map((tc, origIdx) => ({ tc, origIdx }));
const buckets = partitionRoundRobin(indexed, lanes.length);
const allRunResults: WorkflowTestCaseResult[][] = [];
for (let iter = 0; iter < args.iterations; iter++) {
if (args.iterations > 1) {
logger.info(`--- Iteration #${String(iter + 1)}/${String(args.iterations)} ---`);
}
const laneResults = await Promise.all(
lanes.map(async (lane, laneIdx) => {
const bucket = buckets[laneIdx];
const laneTag =
lanes.length > 1 ? ` [lane ${String(laneIdx + 1)}/${String(lanes.length)}]` : '';
const results = await runWithConcurrency(
bucket,
async ({ tc }) =>
await runWorkflowTestCase({
client: lane.client,
testCase: tc.testCase,
timeoutMs: args.timeoutMs,
seededCredentialTypes: lane.seedResult.seededTypes,
preRunWorkflowIds: lane.preRunWorkflowIds,
claimedWorkflowIds: lane.claimedWorkflowIds,
logger,
keepWorkflows: args.keepWorkflows,
laneTag,
}),
MAX_CONCURRENT_BUILDS,
);
return bucket.map((b, i) => ({ origIdx: b.origIdx, result: results[i] }));
}),
);
const flat = laneResults.flat();
flat.sort((a, b) => a.origIdx - b.origIdx);
allRunResults.push(flat.map((x) => x.result));
}
// Iterations are independent — run them in parallel.
const allRunResults: WorkflowTestCaseResult[][] = await Promise.all(
Array.from({ length: args.iterations }, async (_unused, iter) => {
if (args.iterations > 1) {
logger.info(`--- Iteration #${String(iter + 1)}/${String(args.iterations)} starting ---`);
}
const laneResults = await Promise.all(
lanes.map(async (lane, laneIdx) => {
const bucket = buckets[laneIdx];
const laneTag =
lanes.length > 1 ? ` [lane ${String(laneIdx + 1)}/${String(lanes.length)}]` : '';
const results = await runWithConcurrency(
bucket,
async ({ tc }) =>
await runWorkflowTestCase({
client: lane.client,
testCase: tc.testCase,
timeoutMs: args.timeoutMs,
seededCredentialTypes: lane.seedResult.seededTypes,
preRunWorkflowIds: lane.preRunWorkflowIds,
claimedWorkflowIds: lane.claimedWorkflowIds,
logger,
keepWorkflows: args.keepWorkflows,
laneTag,
}),
MAX_CONCURRENT_BUILDS,
);
return bucket.map((b, i) => ({ origIdx: b.origIdx, result: results[i] }));
}),
);
const flat = laneResults.flat();
flat.sort((a, b) => a.origIdx - b.origIdx);
return flat.map((x) => x.result);
}),
);
return aggregateResults(allRunResults, args.iterations);
}
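`runWithConcurrency` is imported elsewhere and not shown in this diff. A minimal limiter with the shape the call sites assume (items, async worker, concurrency cap; results in input order) could look like this — a hypothetical sketch, not the project's implementation:

```typescript
// Run `worker` over `items` with at most `limit` in flight, preserving
// input order in the results array. Each "lane" pulls the next index
// until the queue is drained; `next++` is safe because JS is single-threaded.
async function runWithConcurrency<T, R>(
	items: T[],
	worker: (item: T) => Promise<R>,
	limit: number,
): Promise<R[]> {
	const results = new Array<R>(items.length);
	let next = 0;
	const lanes = Array.from({ length: Math.min(limit, items.length) }, async () => {
		while (next < items.length) {
			const i = next++;
			results[i] = await worker(items[i]);
		}
	});
	await Promise.all(lanes);
	return results;
}
```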
@ -826,15 +879,22 @@ function computePassRatePerIter(evaluation: MultiRunEvaluation): string {
function writeEvalResults(
evaluation: MultiRunEvaluation,
duration: number,
outputDir?: string,
): string {
outputDir: string | undefined,
experimentName: string | undefined,
outcome: ComparisonOutcome | undefined,
commitSha: string | undefined,
slugByTestCase: Map<WorkflowTestCase, string> | undefined,
): { jsonPath: string; prCommentPath: string } {
const { totalRuns, testCases } = evaluation;
const metrics = computeAggregateMetrics(evaluation);
const result = outcome?.kind === 'ok' ? outcome.result : undefined;
const report = {
timestamp: new Date().toISOString(),
duration,
totalRuns,
experimentName,
summary: {
testCases: testCases.length,
built: metrics.built,
@ -843,6 +903,19 @@ function writeEvalResults(
passHatK: metrics.passHatK,
passRatePerIter: metrics.passRatePerIter,
},
// Structured comparison payload only — the rendered markdown lives in
// the sibling `eval-pr-comment.md` file so consumers can pick the format
// they want without re-running the eval. `comparisonStatus` records why
// the comparison was skipped when applicable, so JSON consumers can
// distinguish "no baseline yet" from "regression detection broke".
comparison: result
? {
baseline: result.baseline.experimentName,
result: serializeComparison(result),
}
: undefined,
comparisonStatus: outcome?.kind ?? 'not_attempted',
comparisonError: outcome?.kind === 'fetch_failed' ? outcome.error : undefined,
testCases: testCases.map((tc) => ({
name: tc.testCase.prompt.slice(0, 70),
buildSuccessCount: tc.buildSuccessCount,
@ -868,74 +941,137 @@ function writeEvalResults(
const targetDir = outputDir ?? process.cwd();
mkdirSync(targetDir, { recursive: true });
const outputPath = join(targetDir, 'eval-results.json');
writeFileSync(outputPath, JSON.stringify(report, null, 2));
return outputPath;
const jsonPath = join(targetDir, 'eval-results.json');
writeFileSync(jsonPath, JSON.stringify(report, null, 2));
// Always write the rendered PR comment — the markdown formatter handles
// both with-comparison and no-baseline cases. CI consumes this file
// directly; local users get a copy-pasteable artifact.
const prCommentPath = join(targetDir, 'eval-pr-comment.md');
writeFileSync(
prCommentPath,
formatComparisonMarkdown(evaluation, outcome, { commitSha, slugByTestCase }),
);
return { jsonPath, prCommentPath };
}
/**
* Convert ComparisonResult into a JSON-serializable shape (Maps don't survive
* JSON.stringify by default).
*/
function serializeComparison(result: ComparisonResult): {
pr: { experimentName: string };
baseline: { experimentName: string };
aggregate: ComparisonResult['aggregate'];
scenarios: ComparisonResult['scenarios'];
prOnly: ComparisonResult['prOnly'];
baselineOnly: ComparisonResult['baselineOnly'];
failureCategories: ComparisonResult['failureCategories'];
} {
return {
pr: result.pr,
baseline: result.baseline,
aggregate: result.aggregate,
scenarios: result.scenarios,
prOnly: result.prOnly,
baselineOnly: result.baselineOnly,
failureCategories: result.failureCategories,
};
}
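The motivation in the doc comment is easy to reproduce: `JSON.stringify` serializes a `Map` as `{}`, silently dropping its entries, so any Map-backed field must be projected to a plain shape first. A quick illustration:

```typescript
const m = new Map([['a', 1]]);
// Maps vanish under JSON.stringify...
const raw = JSON.stringify({ scenarios: m }); // '{"scenarios":{}}'
// ...so project to a plain object (or an array of entries) first.
const projected = JSON.stringify({ scenarios: Object.fromEntries(m) }); // '{"scenarios":{"a":1}}'
```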
// ---------------------------------------------------------------------------
// Console summary
// Comparison vs the pinned baseline experiment
// ---------------------------------------------------------------------------
function printSummary(evaluation: MultiRunEvaluation): void {
const { totalRuns, testCases } = evaluation;
const multiRun = totalRuns > 1;
const metrics = computeAggregateMetrics(evaluation);
/**
* Best-effort comparison. Returns a tagged outcome so the PR comment can
* distinguish "no baseline yet" / "this run IS the baseline" from a real
* regression-detection outage (LangSmith down, fetch failure). Never throws:
* the eval run is not gated on the comparison.
*/
async function tryRunComparison(config: {
lsClient: Client;
prExperimentName: string;
evaluation: MultiRunEvaluation;
testCasesWithFiles: WorkflowTestCaseWithFile[];
logger: EvalLogger;
}): Promise<ComparisonOutcome> {
const { lsClient, prExperimentName, evaluation, testCasesWithFiles, logger } = config;
console.log('\n=== Workflow Eval Results ===\n');
for (const tc of testCases) {
console.log(`${tc.testCase.prompt.slice(0, 70)}...`);
if (multiRun) {
console.log(` Build: ${String(tc.buildSuccessCount)}/${String(totalRuns)} runs`);
} else {
const r = tc.runs[0];
const buildStatus = r.workflowBuildSuccess ? 'BUILT' : 'BUILD FAILED';
console.log(` Workflow: ${buildStatus}${r.workflowId ? ` (${r.workflowId})` : ''}`);
if (r.buildError) {
console.log(` Error: ${r.buildError.slice(0, 200)}`);
}
try {
const baselineName = await findLatestBaseline(lsClient);
if (!baselineName) {
logger.verbose(
'No baseline experiment found — skipping comparison. ' +
'Run with --experiment-name instance-ai-baseline to create one.',
);
return { kind: 'no_baseline' };
}
if (baselineName === prExperimentName) {
logger.verbose('Current run is the baseline — skipping comparison.');
return { kind: 'self_baseline', experimentName: baselineName };
}
logger.info(`Comparing against baseline: ${baselineName}`);
const baseline = await fetchBaselineBucket(lsClient, baselineName);
const pr = bucketFromEvaluation(evaluation, testCasesWithFiles, prExperimentName);
return { kind: 'ok', result: compareBuckets(pr, baseline) };
} catch (error: unknown) {
const msg = error instanceof Error ? error.message : String(error);
logger.warn(`Comparison vs baseline failed: ${msg}`);
return { kind: 'fetch_failed', error: msg };
}
}
/**
* Project the in-memory MultiRunEvaluation onto the bucket shape used by
* fetchBaselineBucket, keyed by `${fileSlug}/${scenarioName}`.
*
* Looks up `fileSlug` by test case reference rather than array index: the
* comparison key depends on getting the right slug, and zipping by index
* silently miscompares if anything ever reorders the aggregate.
*/
function bucketFromEvaluation(
evaluation: MultiRunEvaluation,
testCasesWithFiles: WorkflowTestCaseWithFile[],
experimentName: string,
): ExperimentBucket {
const slugByTestCase = new Map(
testCasesWithFiles.map(({ testCase, fileSlug }) => [testCase, fileSlug]),
);
const scenarios = new Map<string, ScenarioCounts>();
const failureCategoryTotals: Record<string, number> = {};
let trialTotal = 0;
for (const tc of evaluation.testCases) {
const fileSlug = slugByTestCase.get(tc.testCase);
if (!fileSlug) {
throw new Error(
`bucketFromEvaluation: no fileSlug for test case "${tc.testCase.prompt.slice(0, 60)}"`,
);
}
const total = tc.runs.length;
for (const sa of tc.scenarios) {
if (multiRun) {
const passAtK = Math.round((sa.passAtK[metrics.kIndex] ?? 0) * 100);
const passHatK = Math.round((sa.passHatK[metrics.kIndex] ?? 0) * 100);
console.log(
` ${sa.scenario.name}: ${String(sa.passCount)}/${String(totalRuns)} passed` +
` | pass@${String(totalRuns)}: ${String(passAtK)}% | pass^${String(totalRuns)}: ${String(passHatK)}%`,
);
} else {
const sr = sa.runs[0];
const icon = sr.success ? '✓' : '✗';
const category = sr.failureCategory ? ` [${sr.failureCategory}]` : '';
console.log(
` ${icon} ${sr.scenario.name}: ${sr.success ? 'PASS' : 'FAIL'}${category} (${String(sr.score * 100)}%)`,
);
if (!sr.success) {
const execErrors = sr.evalResult?.errors ?? [];
if (execErrors.length > 0) {
console.log(` Error: ${execErrors.join('; ').slice(0, 200)}`);
}
console.log(` Diagnosis: ${sr.reasoning.slice(0, 200)}`);
const key = `${fileSlug}/${sa.scenario.name}`;
const failureCategories: Record<string, number> = {};
for (const sr of sa.runs) {
trialTotal++;
if (!sr.success && sr.failureCategory) {
failureCategories[sr.failureCategory] = (failureCategories[sr.failureCategory] ?? 0) + 1;
failureCategoryTotals[sr.failureCategory] =
(failureCategoryTotals[sr.failureCategory] ?? 0) + 1;
}
}
scenarios.set(key, {
testCaseFile: fileSlug,
scenarioName: sa.scenario.name,
passed: sa.passCount,
total,
failureCategories,
});
}
console.log('');
}
if (multiRun) {
console.log(
`${String(metrics.built)}/${String(testCases.length)} built | pass@${String(totalRuns)}: ${String(Math.round(metrics.passAtK * 100))}% | pass^${String(totalRuns)}: ${String(Math.round(metrics.passHatK * 100))}% | iterations: ${metrics.passRatePerIter}`,
);
} else {
const allScenarios = testCases.flatMap((tc) => tc.scenarios);
const passed = allScenarios.filter((s) => s.runs[0]?.success).length;
const total = metrics.scenariosTotal;
console.log(
`${String(metrics.built)}/${String(testCases.length)} built | ${String(passed)}/${String(total)} passed (${String(total > 0 ? Math.round((passed / total) * 100) : 0)}%)`,
);
}
return { experimentName, scenarios, failureCategoryTotals, trialTotal };
}
main().catch((error) => {

View File

@ -0,0 +1,333 @@
// ---------------------------------------------------------------------------
// Comparison core: take two experiment buckets, return a ComparisonResult.
//
// Pure function, no I/O. The tier thresholds (p-value cutoff, minimum delta,
// minimum baseline pass rate) live in statistics.ts — there's no CLI knob.
// Tune them there if the false-positive rate drifts.
// ---------------------------------------------------------------------------
import {
classifyScenario,
wilsonInterval,
type ClassifyOptions,
type ScenarioClassification,
type ScenarioVerdict,
} from './statistics';
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
export interface ScenarioCounts {
testCaseFile: string;
scenarioName: string;
passed: number;
total: number;
failureCategories?: Record<string, number>;
}
export interface ExperimentBucket {
experimentName: string;
scenarios: Map<string, ScenarioCounts>;
/**
* Aggregated failure-category counts across all trials in all scenarios.
* Used for the run-level failure-category drift table orthogonal to
* per-scenario verdicts.
*/
failureCategoryTotals?: Record<string, number>;
trialTotal?: number;
}
export interface ScenarioComparison extends ScenarioClassification {
testCaseFile: string;
scenarioName: string;
prPasses: number;
prTotal: number;
baselinePasses: number;
baselineTotal: number;
}
export interface AggregateComparison {
intersectionSize: number;
prAggregatePassRate: number;
baselineAggregatePassRate: number;
prAggregateCI: { lower: number; upper: number };
baselineAggregateCI: { lower: number; upper: number };
delta: number;
}
export interface FailureCategoryComparison {
category: string;
prCount: number;
prRate: number; // count / trialTotal
baselineCount: number;
baselineRate: number;
delta: number; // prRate - baselineRate
notable: boolean;
}
export interface ComparisonResult {
pr: { experimentName: string };
baseline: { experimentName: string };
aggregate: AggregateComparison;
scenarios: ScenarioComparison[];
prOnly: Array<{ testCaseFile: string; scenarioName: string }>;
baselineOnly: Array<{ testCaseFile: string; scenarioName: string }>;
failureCategories: FailureCategoryComparison[];
}
/**
* Result of a comparison attempt. The `kind` field distinguishes between
* "ran successfully", "skipped intentionally" (no baseline yet, current run
* IS the baseline), and "failed unexpectedly" (LangSmith API error, fetch
* timeout, etc.). The PR comment renders a different alert per kind so
* readers can tell a missing baseline from a regression-detection outage.
*/
export type ComparisonOutcome =
| { kind: 'ok'; result: ComparisonResult }
| { kind: 'no_baseline' }
| { kind: 'self_baseline'; experimentName: string }
| { kind: 'fetch_failed'; error: string };
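Because `kind` is a discriminant, consumers can handle every outcome exhaustively with a switch; the compiler flags any kind added later that a branch forgets. A minimal sketch (the local type mirrors the exported union so the block is self-contained):

```typescript
// Local mirror of ComparisonOutcome for a standalone example.
type Outcome =
	| { kind: 'ok'; result: { scenarioCount: number } }
	| { kind: 'no_baseline' }
	| { kind: 'self_baseline'; experimentName: string }
	| { kind: 'fetch_failed'; error: string };

function describeOutcome(outcome: Outcome): string {
	switch (outcome.kind) {
		case 'ok':
			return `compared ${outcome.result.scenarioCount} scenarios`;
		case 'no_baseline':
			return 'skipped: no baseline experiment yet';
		case 'self_baseline':
			return `skipped: this run is the baseline (${outcome.experimentName})`;
		case 'fetch_failed':
			return `comparison unavailable: ${outcome.error}`;
	}
}
```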
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/** Hard regressions only — high-confidence, gating-grade flags. */
export function hardRegressions(result: ComparisonResult): ScenarioComparison[] {
return result.scenarios.filter((s) => s.verdict === 'hard_regression');
}
/** Soft regressions — looser thresholds, worth investigating but not gating. */
export function softRegressions(result: ComparisonResult): ScenarioComparison[] {
return result.scenarios.filter((s) => s.verdict === 'soft_regression');
}
/** Movement ≥ watchDelta without reaching a flag tier. Visibility only. */
export function watchList(result: ComparisonResult): ScenarioComparison[] {
return result.scenarios.filter((s) => s.verdict === 'watch');
}
export function improvements(result: ComparisonResult): ScenarioComparison[] {
return result.scenarios.filter((s) => s.verdict === 'improvement');
}
export function byVerdict(result: ComparisonResult): Record<ScenarioVerdict, number> {
const counts: Record<ScenarioVerdict, number> = {
hard_regression: 0,
soft_regression: 0,
watch: 0,
improvement: 0,
stable: 0,
unreliable_baseline: 0,
insufficient_data: 0,
};
for (const s of result.scenarios) counts[s.verdict]++;
return counts;
}
// ---------------------------------------------------------------------------
// Compare
// ---------------------------------------------------------------------------
/**
* Compare two experiment buckets and produce a structured comparison result.
*
* Aggregate is computed over the *intersection* of scenarios: the only
* scenarios for which the rates are directly comparable. PR-only and
* baseline-only scenarios are surfaced separately, not folded into the
* aggregate.
*
* Aggregate pass rate is the *micro* average: total passes / total trials
* across the intersection.
*
* `options` exists for tests; production callers pass nothing.
*/
export function compareBuckets(
pr: ExperimentBucket,
baseline: ExperimentBucket,
options: ClassifyOptions = {},
): ComparisonResult {
const scenarios: ScenarioComparison[] = [];
const prOnly: Array<{ testCaseFile: string; scenarioName: string }> = [];
const baselineOnly: Array<{ testCaseFile: string; scenarioName: string }> = [];
let prIPasses = 0;
let prITotal = 0;
let baseIPasses = 0;
let baseITotal = 0;
for (const [key, prCounts] of pr.scenarios) {
const baseCounts = baseline.scenarios.get(key);
if (!baseCounts) {
prOnly.push({
testCaseFile: prCounts.testCaseFile,
scenarioName: prCounts.scenarioName,
});
continue;
}
prIPasses += prCounts.passed;
prITotal += prCounts.total;
baseIPasses += baseCounts.passed;
baseITotal += baseCounts.total;
const classification = classifyScenario(
prCounts.passed,
prCounts.total,
baseCounts.passed,
baseCounts.total,
options,
);
scenarios.push({
testCaseFile: prCounts.testCaseFile,
scenarioName: prCounts.scenarioName,
prPasses: prCounts.passed,
prTotal: prCounts.total,
baselinePasses: baseCounts.passed,
baselineTotal: baseCounts.total,
...classification,
});
}
for (const [key, baseCounts] of baseline.scenarios) {
if (!pr.scenarios.has(key)) {
baselineOnly.push({
testCaseFile: baseCounts.testCaseFile,
scenarioName: baseCounts.scenarioName,
});
}
}
const aggregate: AggregateComparison = {
intersectionSize: scenarios.length,
prAggregatePassRate: rate(prIPasses, prITotal),
baselineAggregatePassRate: rate(baseIPasses, baseITotal),
prAggregateCI: wilsonInterval(prIPasses, prITotal),
baselineAggregateCI: wilsonInterval(baseIPasses, baseITotal),
delta: rate(prIPasses, prITotal) - rate(baseIPasses, baseITotal),
};
scenarios.sort(scenarioComparator);
const failureCategories = compareFailureCategories(pr, baseline);
return {
pr: { experimentName: pr.experimentName },
baseline: { experimentName: baseline.experimentName },
aggregate,
scenarios,
prOnly,
baselineOnly,
failureCategories,
};
}
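The micro average above weights every trial equally, so a scenario with many iterations moves the aggregate more than one with few. A self-contained sketch of micro vs. the macro alternative (mean of per-scenario rates), with made-up counts:

```typescript
interface Counts { passed: number; total: number }

// Micro: total passes over total trials — what compareBuckets uses.
const micro = (s: Counts[]) => {
	const passes = s.reduce((acc, c) => acc + c.passed, 0);
	const trials = s.reduce((acc, c) => acc + c.total, 0);
	return trials > 0 ? passes / trials : 0;
};

// Macro: unweighted mean of per-scenario rates, shown for contrast.
const macro = (s: Counts[]) =>
	s.length > 0 ? s.reduce((acc, c) => acc + c.passed / c.total, 0) / s.length : 0;

const scenarios = [
	{ passed: 9, total: 10 }, // the larger scenario dominates micro
	{ passed: 0, total: 2 },
];
// micro = 9/12 = 0.75; macro = (0.9 + 0) / 2 = 0.45
```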
// ---------------------------------------------------------------------------
// Failure-category drift
// ---------------------------------------------------------------------------
/** Min absolute rate gap to consider a category notable (5 percentage points). */
const CATEGORY_NOTABLE_RATE_DELTA = 0.05;
/** Min absolute trial-count gap (after scaling baseline to the PR trial count) required alongside the rate gap. */
const CATEGORY_NOTABLE_COUNT_DELTA = 3;
/**
* Categories the verifier is supposed to emit. Anything else (malformed
* strings like `-`, `>builder_issue`, empty, etc.) is dropped from the
* comparison so the PR comment doesn't display verifier noise. Keep in sync
* with the verifier's category enum; unknown values are logged via
* console.warn (see compareFailureCategories).
*/
const KNOWN_FAILURE_CATEGORIES = new Set([
'builder_issue',
'mock_issue',
'framework_issue',
'verification_failure',
'build_failure',
]);
function isCategoryNotable(
prCount: number,
prTotal: number,
baselineCount: number,
baselineTotal: number,
): boolean {
const rateGap = Math.abs(prCount / prTotal - baselineCount / baselineTotal);
if (rateGap < CATEGORY_NOTABLE_RATE_DELTA) return false;
const expectedPrCount = baselineCount * (prTotal / baselineTotal);
const countGap = Math.abs(prCount - expectedPrCount);
return countGap >= CATEGORY_NOTABLE_COUNT_DELTA;
}
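A worked example of the two-gate check, with the logic inlined so it runs standalone. The count gate compares the PR count against the baseline count scaled to the PR's trial volume, which stops small runs from flagging noise:

```typescript
// Inlined copy of the notable check, using the same thresholds.
const RATE_DELTA = 0.05;
const COUNT_DELTA = 3;

function notable(prCount: number, prTotal: number, baseCount: number, baseTotal: number): boolean {
	const rateGap = Math.abs(prCount / prTotal - baseCount / baseTotal);
	if (rateGap < RATE_DELTA) return false;
	// Scale the baseline count to the PR's trial volume before comparing counts.
	const expected = baseCount * (prTotal / baseTotal);
	return Math.abs(prCount - expected) >= COUNT_DELTA;
}
// 6/10 vs 4/20: rate gap 0.4, expected PR count 2, count gap 4 → notable
// 3/10 vs 4/20: rate gap 0.1, expected PR count 2, count gap 1 → suppressed
```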
function compareFailureCategories(
pr: ExperimentBucket,
baseline: ExperimentBucket,
): FailureCategoryComparison[] {
if (!pr.failureCategoryTotals || !baseline.failureCategoryTotals) return [];
const prTotal = pr.trialTotal ?? 0;
const baseTotal = baseline.trialTotal ?? 0;
if (prTotal === 0 || baseTotal === 0) return [];
// Surface unrecognised values so we notice when the verifier adds a new
// category (or starts emitting noise we should clean up). Doesn't enter
// the comparison output; the renderer only knows about KNOWN_FAILURE_CATEGORIES.
for (const category of Object.keys(pr.failureCategoryTotals)) {
if (!KNOWN_FAILURE_CATEGORIES.has(category)) {
console.warn(`[comparison] dropping unknown failureCategory "${category}"`);
}
}
for (const category of Object.keys(baseline.failureCategoryTotals)) {
if (!KNOWN_FAILURE_CATEGORIES.has(category)) {
console.warn(`[comparison] dropping unknown failureCategory "${category}"`);
}
}
// Always emit a row for every known category, even if both sides are 0.
// The renderer can decide whether to suppress 0/0 rows; this gives readers
// a complete picture of the failure-type taxonomy by default.
const out: FailureCategoryComparison[] = [];
for (const category of KNOWN_FAILURE_CATEGORIES) {
const prCount = pr.failureCategoryTotals[category] ?? 0;
const baselineCount = baseline.failureCategoryTotals[category] ?? 0;
out.push({
category,
prCount,
prRate: prCount / prTotal,
baselineCount,
baselineRate: baselineCount / baseTotal,
delta: prCount / prTotal - baselineCount / baseTotal,
notable: isCategoryNotable(prCount, prTotal, baselineCount, baseTotal),
});
}
// Sort: notable first, then by absolute delta descending.
out.sort((a, b) => {
if (a.notable !== b.notable) return a.notable ? -1 : 1;
return Math.abs(b.delta) - Math.abs(a.delta);
});
return out;
}
function rate(passes: number, total: number): number {
return total > 0 ? passes / total : 0;
}
const VERDICT_ORDER: Record<ScenarioComparison['verdict'], number> = {
hard_regression: 0,
soft_regression: 1,
improvement: 2,
watch: 3,
unreliable_baseline: 4,
stable: 5,
insufficient_data: 6,
};
function scenarioComparator(a: ScenarioComparison, b: ScenarioComparison): number {
const av = VERDICT_ORDER[a.verdict];
const bv = VERDICT_ORDER[b.verdict];
if (av !== bv) return av - bv;
const fileCmp = a.testCaseFile.localeCompare(b.testCaseFile);
if (fileCmp !== 0) return fileCmp;
return a.scenarioName.localeCompare(b.scenarioName);
}

View File

@ -0,0 +1,123 @@
// ---------------------------------------------------------------------------
// Find and fetch the pinned baseline experiment from LangSmith.
//
// The baseline is whichever experiment most recently used the
// `instance-ai-baseline` prefix. To refresh, run the eval with that prefix:
//
// pnpm eval:instance-ai --experiment-name instance-ai-baseline --iterations 10
//
// LangSmith appends a random suffix, so successive baseline runs become
// `instance-ai-baseline-7abc1234`, `instance-ai-baseline-9def5678`, etc.
// We pick the most recently started one.
//
// Two functions, both small:
//
// findLatestBaseline — list baseline-prefixed projects, pick newest.
// fetchBaselineBucket — read its root runs, bucket per scenario.
//
// Both throw on transport errors. Callers are expected to swallow with a log:
// the comparison is advisory and shouldn't fail the eval run.
// ---------------------------------------------------------------------------
import type { Client } from 'langsmith';
import { z } from 'zod';
import type { ExperimentBucket, ScenarioCounts } from './compare';
/**
* Prefix the latest-baseline lookup matches against. The CLI flag
* `--experiment-name instance-ai-baseline` produces project names like
* `instance-ai-baseline-7abc1234` (LangSmith appends a hyphen + suffix), so
* the constant must end in `-` to avoid matching unrelated names that
* happen to start with `instance-ai-baseline...`.
*/
export const BASELINE_EXPERIMENT_PREFIX = 'instance-ai-baseline-';
const inputsSchema = z
.object({
testCaseFile: z.string().default(''),
scenarioName: z.string().default(''),
})
.passthrough();
const outputsSchema = z
.object({
passed: z.boolean().default(false),
failureCategory: z.string().optional(),
})
.passthrough();
/**
* Return the most recently created baseline experiment, or `undefined` if
* none exist. We pick by `start_time` so a re-run of an older snapshot
* doesn't displace the latest one.
*/
export async function findLatestBaseline(client: Client): Promise<string | undefined> {
let latest: { name: string; ts: number } | undefined;
for await (const project of client.listProjects({ nameContains: BASELINE_EXPERIMENT_PREFIX })) {
const name = project.name;
if (!name?.startsWith(BASELINE_EXPERIMENT_PREFIX)) continue;
const ts = project.start_time ? new Date(project.start_time).getTime() : 0;
if (!latest || ts > latest.ts) latest = { name, ts };
}
return latest?.name;
}
/**
* Fetch a baseline experiment's per-scenario pass/fail counts. Each root run
* corresponds to one (testCaseFile, scenarioName, iteration) triple; we
* bucket by `${testCaseFile}/${scenarioName}` and accumulate.
*
* Throws if the project does not exist.
*/
export async function fetchBaselineBucket(
client: Client,
experimentName: string,
): Promise<ExperimentBucket> {
const project = await client.readProject({ projectName: experimentName });
const scenarios = new Map<string, ScenarioCounts>();
const failureCategoryTotals: Record<string, number> = {};
let trialTotal = 0;
for await (const run of client.listRuns({ projectId: project.id, isRoot: true })) {
const inputs = inputsSchema.safeParse(run.inputs ?? {});
if (!inputs.success || !inputs.data.testCaseFile || !inputs.data.scenarioName) continue;
// Skip runs that never produced outputs (still running, crashed before
* completion, infra error). Without this guard, every field would take its
* default (passed → false), coercing them into "failed" trials and inflating
// the baseline failure count. Mirrors `parseTargetOutput` in cli/index.ts.
const rawOutputs = run.outputs;
if (
rawOutputs === null ||
rawOutputs === undefined ||
typeof rawOutputs !== 'object' ||
Object.keys(rawOutputs).length === 0
) {
continue;
}
const outputs = outputsSchema.safeParse(rawOutputs);
if (!outputs.success) continue;
const key = `${inputs.data.testCaseFile}/${inputs.data.scenarioName}`;
const existing: ScenarioCounts = scenarios.get(key) ?? {
testCaseFile: inputs.data.testCaseFile,
scenarioName: inputs.data.scenarioName,
passed: 0,
total: 0,
failureCategories: {},
};
existing.total++;
trialTotal++;
if (outputs.data.passed) {
existing.passed++;
} else if (outputs.data.failureCategory) {
const cat = outputs.data.failureCategory;
existing.failureCategories = existing.failureCategories ?? {};
existing.failureCategories[cat] = (existing.failureCategories[cat] ?? 0) + 1;
failureCategoryTotals[cat] = (failureCategoryTotals[cat] ?? 0) + 1;
}
scenarios.set(key, existing);
}
return { experimentName, scenarios, failureCategoryTotals, trialTotal };
}
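The newest-by-start_time selection in `findLatestBaseline` can be sketched over plain records (the real function iterates `client.listProjects`; the in-memory version below is for illustration only):

```typescript
interface Project { name: string; start_time?: string }
const PREFIX = 'instance-ai-baseline-';

// Pick the most recently started project matching the baseline prefix.
// Missing start_time sorts as epoch 0, so dated runs always win.
function latestBaseline(projects: Project[]): string | undefined {
	let latest: { name: string; ts: number } | undefined;
	for (const p of projects) {
		if (!p.name.startsWith(PREFIX)) continue;
		const ts = p.start_time ? new Date(p.start_time).getTime() : 0;
		if (!latest || ts > latest.ts) latest = { name: p.name, ts };
	}
	return latest?.name;
}
```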

View File

@ -0,0 +1,961 @@
// ---------------------------------------------------------------------------
// Render the eval run as a PR comment (markdown) or a console summary
// (aligned plain text). Both formats are driven by:
//
// - MultiRunEvaluation — pass rates, build counts, per-trial reasoning
// - ComparisonOutcome (optional) — tagged result of the baseline
// comparison: `ok` (ran, has scenarios), `no_baseline` (skipped), or
// `fetch_failed` / `self_baseline` (skipped for cause). Each kind
// drives a distinct top-of-comment alert so a LangSmith outage doesn't
// get dressed up as "no baseline configured".
//
// When no comparison is available (no baseline yet, LangSmith offline)
// the renderers still produce a useful per-test-case summary. When a
// comparison is available, sections render in priority order:
// regressions, soft regressions, notable movement, improvements,
// failure-category drift. Only sections with content are emitted.
// ---------------------------------------------------------------------------
import {
hardRegressions,
improvements,
softRegressions,
watchList,
type ComparisonOutcome,
type ComparisonResult,
type FailureCategoryComparison,
type ScenarioComparison,
} from './compare';
import type {
MultiRunEvaluation,
TestCaseAggregation,
WorkflowTestCase,
WorkflowTestCaseResult,
} from '../types';
interface FormatOptions {
/** Optional commit SHA to include in the heading. Truncated to 8 chars. */
commitSha?: string;
/** Maps each test-case reference to its file slug. When provided, the
* per-scenario failure breakdown looks up failed runs by
* `${fileSlug}/${scenarioName}` deterministic across collisions like
* multiple `happy-path` scenarios. When omitted, the breakdown is
* skipped (no name-only fallback that lookup was wrong on real data). */
slugByTestCase?: Map<WorkflowTestCase, string>;
}
// ---------------------------------------------------------------------------
// Markdown PR comment
// ---------------------------------------------------------------------------
export function formatComparisonMarkdown(
evaluation: MultiRunEvaluation,
outcome?: ComparisonOutcome,
options: FormatOptions = {},
): string {
const lines: string[] = [];
const comparison = outcome?.kind === 'ok' ? outcome.result : undefined;
lines.push(formatHeading(options.commitSha));
lines.push('');
lines.push(formatTopAlert(outcome));
lines.push('');
lines.push(formatAggregateBlock(evaluation, comparison));
lines.push('');
if (comparison) {
const hard = hardRegressions(comparison);
const soft = softRegressions(comparison);
const watch = watchList(comparison);
const imps = improvements(comparison);
const renderedAnyTable = hard.length > 0 || soft.length > 0 || imps.length > 0;
// Built once and reused across the regression-tier sections so each
// scenario row can carry a collapsible breakdown of its failed PR runs.
// Improvements skip the breakdown — they passed. Skipped entirely when
// the caller didn't pass a slug map (lookup would be ambiguous).
const failedIndex = options.slugByTestCase
? buildFailedRunsIndex(evaluation, options.slugByTestCase)
: undefined;
if (hard.length > 0) {
lines.push(
...renderScenarioSection('Regressions', '— high-confidence', hard, true, failedIndex),
);
}
if (soft.length > 0) {
lines.push(
...renderScenarioSection(
'Soft regressions',
'— investigate if related to your changes',
soft,
true,
failedIndex,
),
);
}
if (watch.length > 0) {
lines.push(
...renderScenarioSection(
'Notable movement',
'— large gap, no statistical flag',
watch,
false,
failedIndex,
),
);
}
if (imps.length > 0) {
lines.push(...renderScenarioSection('Improvements', '', imps, true));
}
if (renderedAnyTable) {
lines.push(
"_p = Fisher's exact one-sided p-value. Lower = stronger evidence of a real change._",
);
lines.push('');
}
// Always render the breakdown when comparison data is available — the
// renderer drops 0/0 rows itself, so empty categories don't pollute
// the output but the reader still sees the full taxonomy of what's
// tracked.
lines.push(...renderFailureCategorySection(comparison.failureCategories));
}
lines.push(...renderPerTestCaseDetails(evaluation, options.slugByTestCase));
if (comparison) {
const otherFindings = renderOtherFindings(comparison);
if (otherFindings.length > 0) lines.push(...otherFindings);
}
const failureDetails = renderFailureDetails(evaluation, options.slugByTestCase);
if (failureDetails.length > 0) lines.push(...failureDetails);
return lines.join('\n');
}
function formatHeading(commitSha?: string): string {
	const sha = commitSha ? ` \`${commitSha.slice(0, 8)}\`` : '';
	return `### Instance AI Workflow Eval${sha}`;
}
function formatTopAlert(outcome?: ComparisonOutcome): string {
if (!outcome) {
return ['> [!NOTE]', '> No baseline comparison ran (LangSmith disabled for this run).'].join(
'\n',
);
}
if (outcome.kind === 'no_baseline') {
return [
'> [!NOTE]',
'> No baseline configured — comparison skipped. Run the eval with `--experiment-name instance-ai-baseline` on master to create one.',
].join('\n');
}
if (outcome.kind === 'self_baseline') {
return [
'> [!NOTE]',
`> This run is the baseline (\`${outcome.experimentName}\`) — nothing to compare against.`,
].join('\n');
}
if (outcome.kind === 'fetch_failed') {
return [
'> [!WARNING]',
`> Regression detection did not run — baseline fetch failed: ${outcome.error}`,
].join('\n');
}
const comparison = outcome.result;
const hard = hardRegressions(comparison).length;
const soft = softRegressions(comparison).length;
const watch = watchList(comparison).length;
const imps = improvements(comparison).length;
const stable = countByVerdict(comparison, 'stable');
const aggDelta = comparison.aggregate.delta * 100;
const aggDeltaText = `${aggDelta >= 0 ? '+' : ''}${aggDelta.toFixed(1)}pp`;
// Always include all five tier counts so readers see what's being tracked,
// not just what's > 0. The hard count is bolded when nonzero for emphasis.
const summary = [
hard > 0 ? `**${hard} regression${hard === 1 ? '' : 's'}**` : '0 regressions',
`${soft} soft`,
`${watch} notable`,
`${imps} improvement${imps === 1 ? '' : 's'}`,
`${stable} stable`,
].join(', ');
let icon: string;
let alertKind: 'CAUTION' | 'WARNING' | 'NOTE' | 'TIP';
if (hard > 0) {
icon = '🔴';
alertKind = 'CAUTION';
} else if (soft > 0) {
icon = '🟡';
alertKind = 'WARNING';
} else if (watch > 0) {
icon = '🔵';
alertKind = 'NOTE';
} else {
icon = '🟢';
alertKind = 'TIP';
}
return `> [!${alertKind}]\n> ${icon} ${summary}. Pass rate ${aggDeltaText} vs master.`;
}
function formatAggregateBlock(
evaluation: MultiRunEvaluation,
comparison?: ComparisonResult,
): string {
if (!comparison) {
const allScenarios = evaluation.testCases.flatMap((tc) => tc.scenarios);
const passed = allScenarios.reduce((sum, sa) => sum + sa.passCount, 0);
const total = allScenarios.reduce((sum, sa) => sum + sa.runs.length, 0);
const rate = total > 0 ? (passed / total) * 100 : 0;
return `**Aggregate**: ${rate.toFixed(1)}% pass (${passed}/${total} trials, ${allScenarios.length} scenarios × N=${evaluation.totalRuns})`;
}
const { aggregate } = comparison;
const delta = aggregate.delta * 100;
const sign = delta >= 0 ? '+' : '';
const arrow = delta > 0 ? ' ↑' : delta < 0 ? ' ↓' : '';
const baselineN = inferBaselineN(comparison);
const sampleLine = baselineN
? `_${aggregate.intersectionSize} scenarios · N=${evaluation.totalRuns} (PR) vs N=${baselineN} (baseline) · baseline: \`${comparison.baseline.experimentName}\`_`
: `_${aggregate.intersectionSize} scenarios · N=${evaluation.totalRuns} (PR) · baseline: \`${comparison.baseline.experimentName}\`_`;
const partial = comparison.baselineOnly.length + comparison.prOnly.length;
const partialNote =
partial > 0
? `\n_Partial: ${[
comparison.baselineOnly.length > 0
? `${comparison.baselineOnly.length} baseline scenarios not run by PR`
: null,
comparison.prOnly.length > 0
? `${comparison.prOnly.length} PR scenarios have no baseline data (added since baseline captured)`
: null,
]
.filter((s) => s !== null)
.join(', ')}._`
: '';
return [
`**Aggregate**: ${pct(aggregate.prAggregatePassRate)}% PR vs ${pct(aggregate.baselineAggregatePassRate)}% baseline — **${sign}${delta.toFixed(1)}pp${arrow}**`,
sampleLine + partialNote,
].join('\n');
}
function renderScenarioSection(
heading: string,
subtitle: string,
scenarios: ScenarioComparison[],
withPValue: boolean,
failedIndex?: FailedRunsBySlug,
): string[] {
const lines: string[] = [];
const headingLine = subtitle
? `#### ${heading} (${scenarios.length}) ${subtitle}`
: `#### ${heading} (${scenarios.length})`;
lines.push(headingLine);
lines.push('');
if (withPValue) {
lines.push('| Scenario | PR | Baseline | Δ | p |');
lines.push('|---|---|---|---|---|');
} else {
lines.push('| Scenario | PR | Baseline | Δ |');
lines.push('|---|---|---|---|');
}
for (const s of scenarios) {
const cells = [
`\`${s.testCaseFile}/${s.scenarioName}\``,
formatRateCell(s.prPasses, s.prTotal),
formatRateCell(s.baselinePasses, s.baselineTotal),
formatDeltaCell(s.delta),
];
if (withPValue) {
const p = s.verdict === 'improvement' ? s.pValueRight : s.pValueLeft;
cells.push(p.toFixed(3));
}
lines.push(`| ${cells.join(' | ')} |`);
}
lines.push('');
// Per-scenario failure breakdown — one collapsible per row that had failed
// PR runs. Lets the reader drill into each flagged scenario without
// hunting through a separate "Failure details" section.
if (failedIndex) {
for (const s of scenarios) {
const failedRuns = failedIndex.get(`${s.testCaseFile}/${s.scenarioName}`) ?? [];
if (failedRuns.length === 0) continue;
lines.push(...renderScenarioFailureBreakdown(s, failedRuns));
}
}
return lines;
}
function renderScenarioFailureBreakdown(
s: ScenarioComparison,
failedRuns: FailedRunDetail[],
): string[] {
const slug = `${s.testCaseFile}/${s.scenarioName}`;
const categoryMix = summarizeCategories(failedRuns);
const summaryParts = [`${failedRuns.length} of ${s.prTotal} failed`];
if (categoryMix) summaryParts.push(categoryMix);
const lines: string[] = [];
lines.push(`<details><summary><code>${slug}</code> — ${summaryParts.join(' · ')}</summary>`);
lines.push('');
for (const fr of failedRuns) {
const tag = fr.category ? ` [${fr.category}]` : '';
lines.push(`> Run ${fr.runIndex}${tag}: ${fr.reasoning.slice(0, 300)}`);
lines.push('>');
}
// Drop the trailing empty quote line.
if (lines[lines.length - 1] === '>') lines.pop();
lines.push('');
lines.push('</details>');
lines.push('');
return lines;
}
function renderFailureCategorySection(categories: FailureCategoryComparison[]): string[] {
// Drop rows that are 0/0 on both sides — they carry no signal for the
// reader. Categories with non-zero count on either side are kept so the
// reader sees the full picture even if not "notable".
const rows = categories.filter((c) => c.prCount > 0 || c.baselineCount > 0);
if (rows.length === 0) return [];
const lines: string[] = [];
lines.push('#### Failure breakdown');
lines.push('');
lines.push('| Category | PR | Baseline | Δ | |');
lines.push('|---|---|---|---|---|');
for (const c of rows) {
const isNew = c.baselineCount === 0 && c.prCount > 0;
const label = isNew ? `\`${c.category}\` 🆕` : `\`${c.category}\``;
const delta = c.delta * 100;
const sign = delta >= 0 ? '+' : '';
const arrow = delta > 0 ? ' ↑' : delta < 0 ? ' ↓' : '';
const notableMarker = c.notable ? '**notable**' : '';
lines.push(
`| ${label} | ${c.prCount} (${pct(c.prRate)}%) | ${c.baselineCount} (${pct(c.baselineRate)}%) | ${sign}${delta.toFixed(1)}pp${arrow} | ${notableMarker} |`,
);
}
lines.push('');
return lines;
}
function renderPerTestCaseDetails(
evaluation: MultiRunEvaluation,
slugByTestCase?: Map<WorkflowTestCase, string>,
): string[] {
const { totalRuns, testCases } = evaluation;
if (testCases.length === 0) return [];
const lines: string[] = [];
lines.push(`<details><summary>Per-test-case results (${testCases.length})</summary>`);
lines.push('');
const renderName = (tc: TestCaseAggregation): string => {
const slug = slugByTestCase?.get(tc.testCase);
return slug ? `\`${slug}\`` : `\`${tc.testCase.prompt.slice(0, 70)}\``;
};
if (totalRuns > 1) {
lines.push(`| Workflow | Built | pass@${totalRuns} | pass^${totalRuns} |`);
lines.push('|---|---|---|---|');
for (const tc of testCases) {
const meanPassAtK = tc.scenarios.length
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passAtK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
const meanPassHatK = tc.scenarios.length
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passHatK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
lines.push(
`| ${renderName(tc)} | ${tc.buildSuccessCount}/${totalRuns} | ${meanPassAtK}% | ${meanPassHatK}% |`,
);
}
} else {
lines.push('| Workflow | Built | Pass rate |');
lines.push('|---|---|---|');
for (const tc of testCases) {
const built = tc.runs[0]?.workflowBuildSuccess ? '✓' : '✗';
const passed = tc.scenarios.filter((sa) => sa.runs[0]?.success).length;
const total = tc.scenarios.length;
lines.push(`| ${renderName(tc)} | ${built} | ${passed}/${total} |`);
}
}
lines.push('');
lines.push('</details>');
lines.push('');
return lines;
}
function renderOtherFindings(comparison: ComparisonResult): string[] {
const stable = countByVerdict(comparison, 'stable');
const flaky = countByVerdict(comparison, 'unreliable_baseline');
const noData = countByVerdict(comparison, 'insufficient_data');
if (stable === 0 && flaky === 0 && noData === 0) return [];
const summaryParts: string[] = [];
if (flaky > 0) summaryParts.push(`${flaky} on flaky baseline`);
if (noData > 0) summaryParts.push(`${noData} no data`);
if (stable > 0) summaryParts.push(`${stable} stable`);
const summary = summaryParts.join(' · ');
const lines: string[] = [];
lines.push(`<details><summary>Other findings: ${summary}</summary>`);
lines.push('');
const stableScenarios = comparison.scenarios.filter((s) => s.verdict === 'stable');
const flakyScenarios = comparison.scenarios.filter((s) => s.verdict === 'unreliable_baseline');
const noDataScenarios = comparison.scenarios.filter((s) => s.verdict === 'insufficient_data');
if (flakyScenarios.length > 0) {
lines.push('**Confident drop on a flaky baseline (surfaced for visibility, not flagged):**');
lines.push('');
lines.push('| Scenario | PR | Baseline | Δ |');
lines.push('|---|---|---|---|');
for (const s of flakyScenarios) {
lines.push(
`| \`${s.testCaseFile}/${s.scenarioName}\` | ${formatRateCell(s.prPasses, s.prTotal)} | ${formatRateCell(s.baselinePasses, s.baselineTotal)} | ${formatDeltaCell(s.delta)} |`,
);
}
lines.push('');
}
if (noDataScenarios.length > 0) {
lines.push(
`**No data:** ${noDataScenarios.map((s) => `\`${s.testCaseFile}/${s.scenarioName}\``).join(', ')}`,
);
lines.push('');
}
if (stableScenarios.length > 0) {
lines.push(`**Stable (${stableScenarios.length}):**`);
lines.push(
stableScenarios.map((s) => `\`${s.testCaseFile}/${s.scenarioName}\``).join(', ') + '.',
);
lines.push('');
}
lines.push('</details>');
lines.push('');
return lines;
}
function renderFailureDetails(
evaluation: MultiRunEvaluation,
slugByTestCase?: Map<WorkflowTestCase, string>,
): string[] {
const failed: Array<{
tc: WorkflowTestCaseResult;
fileSlug: string | undefined;
scenarioName: string;
failedRuns: Array<{ category?: string; reasoning: string }>;
}> = [];
for (const tc of evaluation.testCases) {
const fileSlug = slugByTestCase?.get(tc.testCase);
for (const sa of tc.scenarios) {
const failedRuns = sa.runs
.filter((r) => !r.success)
.map((r) => ({ category: r.failureCategory, reasoning: r.reasoning }));
if (failedRuns.length > 0) {
failed.push({ tc: tc.runs[0], fileSlug, scenarioName: sa.scenario.name, failedRuns });
}
}
}
if (failed.length === 0) return [];
const lines: string[] = [];
lines.push('<details><summary>Failure details</summary>');
lines.push('');
for (const { tc, fileSlug, scenarioName, failedRuns } of failed) {
const slug = fileSlug
? `${fileSlug}/${scenarioName}`
: `${tc.testCase.prompt.slice(0, 50).trim()} / ${scenarioName}`;
lines.push(`**\`${slug}\`** — ${failedRuns.length} failed`);
for (const fr of failedRuns) {
const tag = fr.category ? ` [${fr.category}]` : '';
lines.push(`> Run${tag}: ${fr.reasoning.slice(0, 200)}`);
}
lines.push('');
}
lines.push('</details>');
lines.push('');
return lines;
}
// ---------------------------------------------------------------------------
// Per-scenario failure lookup
// ---------------------------------------------------------------------------
//
// The comparison carries per-scenario counts (passed / total) but not the
// underlying reasoning text. The evaluation has the reasoning, but keys
// testCases by reference identity — not by the `testCaseFile` slug used in
// the comparison. The slug map (built in cli/index.ts where the file slugs
// are first known) bridges the two so the lookup is deterministic. Without
// it we'd have to disambiguate by scenarioName alone, which collides on
// reused names (`happy-path` shows up across most workflows).
interface FailedRunDetail {
category?: string;
reasoning: string;
runIndex: number; // 1-based for display
}
type FailedRunsBySlug = Map<string, FailedRunDetail[]>;
function buildFailedRunsIndex(
evaluation: MultiRunEvaluation,
slugByTestCase: Map<WorkflowTestCase, string>,
): FailedRunsBySlug {
const map: FailedRunsBySlug = new Map();
for (const tc of evaluation.testCases) {
const fileSlug = slugByTestCase.get(tc.testCase);
if (!fileSlug) continue; // testCase not in the slug map — skip rather than misattribute
for (const sa of tc.scenarios) {
const failedRuns: FailedRunDetail[] = [];
sa.runs.forEach((r, i) => {
if (!r.success) {
failedRuns.push({
category: r.failureCategory,
reasoning: r.reasoning,
runIndex: i + 1,
});
}
});
if (failedRuns.length > 0) {
map.set(`${fileSlug}/${sa.scenario.name}`, failedRuns);
}
}
}
return map;
}
function summarizeCategories(failedRuns: FailedRunDetail[]): string | undefined {
const counts = new Map<string, number>();
for (const fr of failedRuns) {
if (fr.category) counts.set(fr.category, (counts.get(fr.category) ?? 0) + 1);
}
if (counts.size === 0) return undefined;
return [...counts.entries()]
.sort((a, b) => b[1] - a[1])
.map(([cat, n]) => `${n}× ${cat}`)
.join(', ');
}
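// A sketch of summarizeCategories on hypothetical runs: three failures
// tagged schema, schema, timeout collapse to "2× schema, 1× timeout"
// (sorted by count, descending). Runs without a category are ignored, and
// a set with no categorised runs at all returns undefined so the caller
// can drop the suffix.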
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
function pct(rate: number): string {
return (rate * 100).toFixed(1);
}
function formatRateCell(passes: number, total: number): string {
const rate = total > 0 ? Math.round((passes / total) * 100) : 0;
return `${passes}/${total} (${rate}%)`;
}
function formatDeltaCell(delta: number): string {
const pp = delta * 100;
const sign = pp >= 0 ? '+' : '';
const arrow = pp > 0 ? ' ↑' : pp < 0 ? ' ↓' : '';
return `${sign}${pp.toFixed(0)}pp${arrow}`;
}
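// Sketch of the delta cell shapes this produces:
//   formatDeltaCell(-0.25) // "-25pp ↓"
//   formatDeltaCell(0.1)   // "+10pp ↑"
//   formatDeltaCell(0)     // "+0pp" (zero takes the plus sign, no arrow)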
function countByVerdict(
comparison: ComparisonResult,
verdict: ScenarioComparison['verdict'],
): number {
return comparison.scenarios.filter((s) => s.verdict === verdict).length;
}
/** Best-effort inference of the baseline iteration count N. The comparison
 * only carries per-scenario trial totals; we take the most common total,
 * since the baseline runs every scenario the same number of times. */
function inferBaselineN(comparison: ComparisonResult): number | undefined {
const totals = comparison.scenarios
.filter((s) => s.baselineTotal > 0)
.map((s) => s.baselineTotal);
if (totals.length === 0) return undefined;
const counts = new Map<number, number>();
for (const t of totals) counts.set(t, (counts.get(t) ?? 0) + 1);
let best = totals[0];
let bestCount = 0;
for (const [n, c] of counts) {
if (c > bestCount) {
best = n;
bestCount = c;
}
}
return best;
}
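// Example: hypothetical baseline totals [3, 3, 5] infer N=3; the mode wins
// even though one scenario ran extra trials, so a single re-run doesn't
// skew the reported baseline N.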
// ---------------------------------------------------------------------------
// Terminal renderer: aligned plain text for the eval CLI's end-of-run print.
// ---------------------------------------------------------------------------
const TERMINAL_INDENT = ' ';
const TERMINAL_TABLE_INDENT = ' ';
export function formatComparisonTerminal(
evaluation: MultiRunEvaluation,
outcome?: ComparisonOutcome,
options: FormatOptions = {},
): string {
const lines: string[] = [];
const comparison = outcome?.kind === 'ok' ? outcome.result : undefined;
	const titleSuffix = options.commitSha ? ` ${options.commitSha.slice(0, 8)}` : '';
	const title = `Instance AI Workflow Eval${titleSuffix}`;
lines.push(title);
lines.push('═'.repeat(title.length));
lines.push(TERMINAL_INDENT + formatTerminalVerdictLine(outcome));
lines.push('');
lines.push(...formatTerminalAggregate(evaluation, comparison));
lines.push('');
lines.push(...formatTerminalPerTestCase(evaluation, options.slugByTestCase));
if (comparison) {
const hard = hardRegressions(comparison);
const soft = softRegressions(comparison);
const watch = watchList(comparison);
const imps = improvements(comparison);
if (hard.length > 0) {
lines.push(
TERMINAL_INDENT +
'REGRESSIONS (high-confidence: large drop on a reliable scenario, unlikely noise)',
);
lines.push(formatTerminalScenarioTable(hard, true));
lines.push('');
}
if (soft.length > 0) {
lines.push(
TERMINAL_INDENT +
'SOFT REGRESSIONS (likely natural variance — investigate if related to your changes)',
);
lines.push(formatTerminalScenarioTable(soft, true));
lines.push('');
}
if (watch.length > 0) {
lines.push(TERMINAL_INDENT + 'NOTABLE MOVEMENT (large gap, no statistical flag)');
lines.push(formatTerminalScenarioTable(watch, false));
lines.push('');
}
if (imps.length > 0) {
lines.push(TERMINAL_INDENT + 'IMPROVEMENTS');
lines.push(formatTerminalScenarioTable(imps, true));
lines.push('');
}
// Always render the breakdown when comparison data is available — same
// rationale as the markdown side. The terminal table drops 0/0 rows
// itself.
const breakdownRows = comparison.failureCategories.filter(
(c) => c.prCount > 0 || c.baselineCount > 0,
);
if (breakdownRows.length > 0) {
lines.push(TERMINAL_INDENT + 'failure breakdown');
lines.push(formatTerminalCategoryTable(breakdownRows));
lines.push('');
}
// Stable count is already in the verdict line; surface only the rarer
// outcomes here.
const flaky = countByVerdict(comparison, 'unreliable_baseline');
const noData = countByVerdict(comparison, 'insufficient_data');
const otherParts: string[] = [];
if (flaky > 0) otherParts.push(`${flaky} on flaky baseline`);
if (noData > 0) otherParts.push(`${noData} no data`);
if (otherParts.length > 0) {
lines.push(TERMINAL_INDENT + 'other: ' + otherParts.join(' · '));
}
}
return lines.join('\n');
}
function formatTerminalVerdictLine(outcome?: ComparisonOutcome): string {
if (!outcome) return '▶ No baseline comparison ran (LangSmith disabled).';
if (outcome.kind === 'no_baseline') {
return '▶ No baseline configured — comparison skipped.';
}
if (outcome.kind === 'self_baseline') {
return `▶ This run is the baseline (${outcome.experimentName}) — nothing to compare.`;
}
if (outcome.kind === 'fetch_failed') {
return `▶ Regression detection did not run — baseline fetch failed: ${outcome.error}`;
}
const comparison = outcome.result;
const hard = hardRegressions(comparison).length;
const soft = softRegressions(comparison).length;
const watch = watchList(comparison).length;
const imps = improvements(comparison).length;
const stable = countByVerdict(comparison, 'stable');
const aggDelta = comparison.aggregate.delta * 100;
const aggDeltaText = `${aggDelta >= 0 ? '+' : ''}${aggDelta.toFixed(1)}pp`;
const summary = [
`${hard} regression${hard === 1 ? '' : 's'}`,
`${soft} soft`,
`${watch} notable`,
`${imps} improvement${imps === 1 ? '' : 's'}`,
`${stable} stable`,
].join(', ');
return `${summary}. Pass rate ${aggDeltaText} vs master.`;
}
function formatTerminalAggregate(
evaluation: MultiRunEvaluation,
comparison?: ComparisonResult,
): string[] {
const lines: string[] = [];
if (!comparison) {
const allScenarios = evaluation.testCases.flatMap((tc) => tc.scenarios);
const passed = allScenarios.reduce((sum, sa) => sum + sa.passCount, 0);
const total = allScenarios.reduce((sum, sa) => sum + sa.runs.length, 0);
const rate = total > 0 ? (passed / total) * 100 : 0;
lines.push(
TERMINAL_INDENT +
`Aggregate: ${rate.toFixed(1)}% pass (${passed}/${total} trials, ${allScenarios.length} scenarios × N=${evaluation.totalRuns})`,
);
return lines;
}
const { aggregate } = comparison;
const baselineN = inferBaselineN(comparison);
const aggDelta = aggregate.delta * 100;
const sign = aggDelta >= 0 ? '+' : '';
const arrow = aggDelta > 0 ? ' ↑' : aggDelta < 0 ? ' ↓' : '';
lines.push(TERMINAL_INDENT + `Aggregate (${aggregate.intersectionSize} scenarios)`);
lines.push(
TERMINAL_INDENT +
` PR ${pct(aggregate.prAggregatePassRate)}% (N=${evaluation.totalRuns})`,
);
if (baselineN !== undefined) {
lines.push(
TERMINAL_INDENT +
` baseline ${pct(aggregate.baselineAggregatePassRate)}% (N=${baselineN})`,
);
} else {
lines.push(TERMINAL_INDENT + ` baseline ${pct(aggregate.baselineAggregatePassRate)}%`);
}
lines.push(TERMINAL_INDENT + ` Δ ${sign}${aggDelta.toFixed(1)}pp${arrow}`);
if (comparison.baselineOnly.length > 0 || comparison.prOnly.length > 0) {
const partialParts: string[] = [];
if (comparison.baselineOnly.length > 0)
partialParts.push(`${comparison.baselineOnly.length} baseline scenarios not run by PR`);
if (comparison.prOnly.length > 0)
partialParts.push(`${comparison.prOnly.length} PR scenarios have no baseline data`);
lines.push(TERMINAL_INDENT + ` partial: ${partialParts.join(', ')}`);
}
return lines;
}
function formatTerminalPerTestCase(
evaluation: MultiRunEvaluation,
slugByTestCase?: Map<WorkflowTestCase, string>,
): string[] {
const { totalRuns, testCases } = evaluation;
if (testCases.length === 0) return [];
const lines: string[] = [];
const heading = `Per-test-case results (${testCases.length})`;
lines.push(TERMINAL_INDENT + heading);
const nameOf = (tc: TestCaseAggregation, max: number): string => {
const slug = slugByTestCase?.get(tc.testCase);
return slug ?? tc.testCase.prompt.slice(0, max);
};
if (totalRuns > 1) {
const rows = testCases.map((tc) => {
const meanPassAtK =
tc.scenarios.length > 0
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passAtK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
const meanPassHatK =
tc.scenarios.length > 0
? Math.round(
(tc.scenarios.reduce((sum, sa) => sum + (sa.passHatK[totalRuns - 1] ?? 0), 0) /
tc.scenarios.length) *
100,
)
: 0;
return {
name: nameOf(tc, 60),
builds: `${tc.buildSuccessCount}/${totalRuns}`,
passAtK: `${meanPassAtK}%`,
passHatK: `${meanPassHatK}%`,
};
});
const nameW = maxWidth(
rows.map((r) => r.name),
'workflow',
);
const buildsW = maxWidth(
rows.map((r) => r.builds),
'builds',
);
const atKHeader = `pass@${totalRuns}`;
const hatKHeader = `pass^${totalRuns}`;
const atKW = maxWidth(
rows.map((r) => r.passAtK),
atKHeader,
);
const hatKW = maxWidth(
rows.map((r) => r.passHatK),
hatKHeader,
);
lines.push(
TERMINAL_TABLE_INDENT +
`${'workflow'.padEnd(nameW)} ${'builds'.padEnd(buildsW)} ${atKHeader.padStart(atKW)} ${hatKHeader.padStart(hatKW)}`,
);
lines.push(
TERMINAL_TABLE_INDENT +
`${'─'.repeat(nameW)} ${'─'.repeat(buildsW)} ${'─'.repeat(atKW)} ${'─'.repeat(hatKW)}`,
);
for (const r of rows) {
lines.push(
TERMINAL_TABLE_INDENT +
`${r.name.padEnd(nameW)} ${r.builds.padEnd(buildsW)} ${r.passAtK.padStart(atKW)} ${r.passHatK.padStart(hatKW)}`,
);
}
} else {
for (const tc of testCases) {
const r = tc.runs[0];
const buildStatus = r.workflowBuildSuccess ? 'BUILT' : 'BUILD FAILED';
lines.push('');
lines.push(TERMINAL_INDENT + `${nameOf(tc, 70)}`);
lines.push(TERMINAL_INDENT + ` ${buildStatus}${r.workflowId ? ` (${r.workflowId})` : ''}`);
if (r.buildError) lines.push(TERMINAL_INDENT + ` error: ${r.buildError.slice(0, 200)}`);
for (const sa of tc.scenarios) {
const sr = sa.runs[0];
const status = sr.success ? 'PASS' : 'FAIL';
const category = sr.failureCategory ? ` [${sr.failureCategory}]` : '';
lines.push(TERMINAL_INDENT + ` ${status} ${sr.scenario.name}${category}`);
if (!sr.success) {
const errs = sr.evalResult?.errors ?? [];
if (errs.length > 0) {
lines.push(TERMINAL_INDENT + ` error: ${errs.join('; ').slice(0, 200)}`);
}
lines.push(TERMINAL_INDENT + ` diagnosis: ${sr.reasoning.slice(0, 200)}`);
}
}
}
}
lines.push('');
return lines;
}
function formatTerminalScenarioTable(scenarios: ScenarioComparison[], withPValue: boolean): string {
const names = scenarios.map((s) => `${s.testCaseFile}/${s.scenarioName}`);
const prCells = scenarios.map((s) => `${s.prPasses}/${s.prTotal}`);
const baseCells = scenarios.map((s) => `${s.baselinePasses}/${s.baselineTotal}`);
const deltaCells = scenarios.map((s) => {
const d = s.delta * 100;
const sign = d >= 0 ? '+' : '';
const arrow = d > 0 ? ' ↑' : d < 0 ? ' ↓' : '';
return `${sign}${d.toFixed(0)}pp${arrow}`;
});
const pCells = withPValue
? scenarios.map((s) => (s.verdict === 'improvement' ? s.pValueRight : s.pValueLeft).toFixed(3))
: [];
const nameW = maxWidth(names, 'scenario');
const prW = maxWidth(prCells, 'PR');
const baseW = maxWidth(baseCells, 'baseline');
const deltaW = maxWidth(deltaCells, 'Δ');
const pW = withPValue ? maxWidth(pCells, 'p') : 0;
const headers = [
'scenario'.padEnd(nameW),
'PR'.padEnd(prW),
'baseline'.padEnd(baseW),
'Δ'.padEnd(deltaW),
];
if (withPValue) headers.push('p'.padEnd(pW));
const widths = withPValue ? [nameW, prW, baseW, deltaW, pW] : [nameW, prW, baseW, deltaW];
const sep = widths.map((w) => '─'.repeat(w)).join(' ');
const rows = scenarios.map((_, i) => {
const cells = [
names[i].padEnd(nameW),
prCells[i].padEnd(prW),
baseCells[i].padEnd(baseW),
deltaCells[i].padEnd(deltaW),
];
if (withPValue) cells.push(pCells[i].padEnd(pW));
return TERMINAL_TABLE_INDENT + cells.join(' ');
});
return [TERMINAL_TABLE_INDENT + headers.join(' '), TERMINAL_TABLE_INDENT + sep, ...rows].join(
'\n',
);
}
function formatTerminalCategoryTable(cats: FailureCategoryComparison[]): string {
const names = cats.map((c) => {
const isNew = c.baselineCount === 0 && c.prCount > 0;
return c.category + (isNew ? ' 🆕' : '');
});
const prCells = cats.map((c) => `${c.prCount} (${pct(c.prRate)}%)`);
const baseCells = cats.map((c) => `${c.baselineCount} (${pct(c.baselineRate)}%)`);
const deltaCells = cats.map((c) => {
const d = c.delta * 100;
const sign = d >= 0 ? '+' : '';
return `${sign}${d.toFixed(1)}pp`;
});
const nameW = maxWidth(names, 'category');
const prW = maxWidth(prCells, 'PR');
const baseW = maxWidth(baseCells, 'baseline');
const headers = ['category'.padEnd(nameW), 'PR'.padEnd(prW), 'baseline'.padEnd(baseW), 'Δ'];
const sep = [nameW, prW, baseW, maxWidth(deltaCells, 'Δ')].map((w) => '─'.repeat(w)).join(' ');
const rows = cats.map(
(_, i) =>
TERMINAL_TABLE_INDENT +
[
names[i].padEnd(nameW),
prCells[i].padEnd(prW),
baseCells[i].padEnd(baseW),
deltaCells[i],
].join(' '),
);
return [TERMINAL_TABLE_INDENT + headers.join(' '), TERMINAL_TABLE_INDENT + sep, ...rows].join(
'\n',
);
}
function maxWidth(values: string[], header: string): number {
return values.reduce((m, v) => Math.max(m, v.length), header.length);
}

View File

@ -0,0 +1,304 @@
// ---------------------------------------------------------------------------
// Decides whether one scenario's pass rate is meaningfully worse than
// another, at the small sample sizes evals run at (N=3 typically).
//
// Public surface:
// - classifyScenario(prPasses, prTotal, basePasses, baseTotal) — the verdict
// - wilsonInterval(passes, total) — confidence band for a pass rate, used
// for the headline aggregate
//
// The implementation uses Fisher's exact test and the Wilson score interval
// under the hood; both are standard small-sample statistics. You don't need
// to know either to use the public API.
// ---------------------------------------------------------------------------
import { strict as assert } from 'node:assert';
// ---------------------------------------------------------------------------
// Fisher's exact test (one-sided)
//
// Given a 2×2 table of pass/fail counts for PR vs baseline, returns the
// probability of seeing a gap at least as bad as the observed one if the two
// groups actually had the same pass rate. Small return value ⇒ strong
// evidence the PR is worse.
// ---------------------------------------------------------------------------
const logFactorialCache: number[] = [0, 0];
function logFactorial(n: number): number {
for (let i = logFactorialCache.length; i <= n; i++) {
logFactorialCache.push(logFactorialCache[i - 1] + Math.log(i));
}
return logFactorialCache[n];
}
function logBinomial(n: number, k: number): number {
if (k < 0 || k > n) return -Infinity;
return logFactorial(n) - logFactorial(k) - logFactorial(n - k);
}
function hypergeomPmf(nPasses: number, nFails: number, nDrawn: number, k: number): number {
const total = nPasses + nFails;
if (k < Math.max(0, nDrawn - nFails) || k > Math.min(nDrawn, nPasses)) return 0;
return Math.exp(
logBinomial(nPasses, k) + logBinomial(nFails, nDrawn - k) - logBinomial(total, nDrawn),
);
}
/**
* One-sided Fisher's exact test (left tail). Returns the probability that
* PR's pass count would be at most `a` if PR and baseline shared the same
 * underlying pass rate. Small value ⇒ PR is significantly worse.
*
* 2×2 table:
*
* passed failed
* PR | a | b |
* Baseline | c | d |
*
* Returns 1 (no information) when either side has no trials, or when all
* trials passed or all failed.
*/
export function fishersExactOneSidedLeft(a: number, b: number, c: number, d: number): number {
const inputs = [a, b, c, d];
for (const v of inputs) {
assert(
Number.isInteger(v) && v >= 0,
'fishersExactOneSidedLeft requires non-negative integers',
);
}
const nPr = a + b;
const nBase = c + d;
const nPasses = a + c;
const nFails = b + d;
if (nPr === 0 || nBase === 0) return 1;
if (nPasses === 0 || nFails === 0) return 1;
let pValue = 0;
const kMax = Math.min(a, nPasses);
for (let k = 0; k <= kMax; k++) {
pValue += hypergeomPmf(nPasses, nFails, nPr, k);
}
// Clamp to [0, 1] — accumulated FP error can push the sum slightly past 1.
return Math.min(1, Math.max(0, pValue));
}
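// Worked example with hypothetical counts: a PR failing all three trials
// against a baseline passing all three gives
//
//   fishersExactOneSidedLeft(0, 3, 3, 0) // = C(3,0)·C(3,3) / C(6,3) = 0.05
//
// i.e. a 5% chance of a gap that extreme under equal true pass rates;
// suggestive at N=3, but not conclusive.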
// ---------------------------------------------------------------------------
// Wilson score interval (95% confidence)
//
// Returns a confidence band for a pass rate that behaves well at small N and
// at extreme rates (close to 0 or 1) — both common in our evals. Used for
// the headline aggregate band only; classification doesn't need it.
// ---------------------------------------------------------------------------
// Standard z-score for a 95% confidence interval. We only ever use 95%, so
// the value is inlined rather than parameterised.
const Z_95 = 1.96;
export function wilsonInterval(passes: number, total: number): { lower: number; upper: number } {
assert(
Number.isInteger(passes) && passes >= 0,
'wilsonInterval: passes must be a non-negative integer',
);
assert(
Number.isInteger(total) && total >= 0,
'wilsonInterval: total must be a non-negative integer',
);
assert(passes <= total, 'wilsonInterval: passes cannot exceed total');
if (total === 0) return { lower: 0, upper: 1 };
const p = passes / total;
const z2 = Z_95 * Z_95;
const denom = 1 + z2 / total;
const center = (p + z2 / (2 * total)) / denom;
const halfWidth = (Z_95 * Math.sqrt((p * (1 - p)) / total + z2 / (4 * total * total))) / denom;
return {
lower: Math.max(0, center - halfWidth),
upper: Math.min(1, center + halfWidth),
};
}
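A standalone sketch of the 95% Wilson band (again with illustrative names, not this module's exports) shows the property the comment claims: at 9/10 passes the naive normal interval would poke above 1, while Wilson stays inside [0, 1]:

```typescript
// Minimal Wilson score interval sketch, 95% confidence only.
const Z = 1.96;

function wilson(passes: number, total: number): { lower: number; upper: number } {
  if (total === 0) return { lower: 0, upper: 1 }; // no trials → no information
  const p = passes / total;
  const z2 = Z * Z;
  const denom = 1 + z2 / total;
  const center = (p + z2 / (2 * total)) / denom;
  const half = (Z * Math.sqrt((p * (1 - p)) / total + z2 / (4 * total * total))) / denom;
  return { lower: Math.max(0, center - half), upper: Math.min(1, center + half) };
}

// 9/10 passes: band ≈ [0.60, 0.98] — wide (small N) but bounded inside [0, 1].
const band = wilson(9, 10);
```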
// ---------------------------------------------------------------------------
// Per-scenario classification
//
// Three flag tiers, evaluated in order of strictness:
//
// hard_regression — high-confidence drop on a reliable baseline.
// Gating-grade.
// soft_regression — looser bar; investigate, not gating.
// watch — moved noticeably but didn't pass either flag tier.
// Pure visibility.
//
// Improvements use the hard tier (we don't surface borderline improvements;
// they tend to be noise in the positive direction).
// ---------------------------------------------------------------------------
export type ScenarioVerdict =
| 'hard_regression' // PR is confidently worse, baseline was reliable
| 'soft_regression' // looser bar — worth investigating, not high-confidence
| 'watch' // moved enough to surface but no flag tier triggered
| 'improvement' // PR is significantly better
| 'stable' // no meaningful change
| 'unreliable_baseline' // confident drop but baseline was too flaky to trust
| 'insufficient_data'; // either side had zero trials
export interface ScenarioClassification {
verdict: ScenarioVerdict;
/** PR pass rate (0..1) */
prPassRate: number;
/** Baseline pass rate (0..1) */
baselinePassRate: number;
/** PR rate minus baseline rate, signed. Negative = PR worse. */
delta: number;
/** Probability the PR is at least this much worse by chance. Lower ⇒ stronger regression evidence. */
pValueLeft: number;
/** Probability the PR is at least this much better by chance. */
pValueRight: number;
}
export interface TierThresholds {
/** Flag only when the chance the gap happened by noise is below this. */
maxPValue: number;
/** Flag only when the absolute pass-rate gap is at least this large (0..1). */
minDelta: number;
/** Flag only when the baseline pass rate was at least this high (0..1). */
minBaselinePassRate: number;
}
export interface ClassifyOptions {
/** Hard-flag thresholds (most strict). Defaults: maxPValue=0.05, minDelta=0.30, minBaselinePassRate=0.70. */
hard?: Partial<TierThresholds>;
/** Soft-flag thresholds (looser). Defaults: maxPValue=0.20, minDelta=0.15, minBaselinePassRate=0.50. */
soft?: Partial<TierThresholds>;
/** Absolute pass-rate change required for a "watch" verdict regardless of significance. Default 0.35. */
watchDelta?: number;
}
const DEFAULT_HARD: TierThresholds = {
maxPValue: 0.05,
minDelta: 0.3,
minBaselinePassRate: 0.7,
};
const DEFAULT_SOFT: TierThresholds = {
maxPValue: 0.2,
minDelta: 0.15,
minBaselinePassRate: 0.5,
};
// Watch threshold: surface scenarios whose pass rate changed by at least 35pp
// without reaching a flag tier. High enough that natural noise on rock-solid
// scenarios (e.g. 2/3 vs 10/10 = 33pp) doesn't crowd the comment.
const DEFAULT_WATCH_DELTA = 0.35;
function meetsThreshold(
pValue: number,
delta: number,
baselineRate: number,
tier: TierThresholds,
direction: 'worse' | 'better',
): boolean {
if (pValue >= tier.maxPValue) return false;
if (direction === 'worse') {
if (delta > -tier.minDelta) return false;
if (baselineRate < tier.minBaselinePassRate) return false;
} else {
if (delta < tier.minDelta) return false;
// Improvements skip the reliability gate — fixing flaky scenarios is a real win.
}
return true;
}
/**
* Classify a single scenario into one of seven verdicts. See ScenarioVerdict
* for the tier semantics.
*
* `options` exists for tests; production callers leave thresholds at defaults.
*/
export function classifyScenario(
prPasses: number,
prTotal: number,
baselinePasses: number,
baselineTotal: number,
options: ClassifyOptions = {},
): ScenarioClassification {
const hard: TierThresholds = { ...DEFAULT_HARD, ...options.hard };
const soft: TierThresholds = { ...DEFAULT_SOFT, ...options.soft };
const watchDelta = options.watchDelta ?? DEFAULT_WATCH_DELTA;
const prPassRate = prTotal > 0 ? prPasses / prTotal : 0;
const baselinePassRate = baselineTotal > 0 ? baselinePasses / baselineTotal : 0;
if (prTotal === 0 || baselineTotal === 0) {
return {
verdict: 'insufficient_data',
prPassRate,
baselinePassRate,
delta: prPassRate - baselinePassRate,
pValueLeft: 1,
pValueRight: 1,
};
}
const a = prPasses;
const b = prTotal - prPasses;
const c = baselinePasses;
const d = baselineTotal - baselinePasses;
const pValueLeft = fishersExactOneSidedLeft(a, b, c, d);
const pValueRight = fishersExactOneSidedLeft(c, d, a, b);
const delta = prPassRate - baselinePassRate;
// Improvement (right tail) — single tier, hard thresholds only
if (meetsThreshold(pValueRight, delta, baselinePassRate, hard, 'better')) {
return { verdict: 'improvement', prPassRate, baselinePassRate, delta, pValueLeft, pValueRight };
}
// Hard regression — passes all three hard gates
if (meetsThreshold(pValueLeft, delta, baselinePassRate, hard, 'worse')) {
return {
verdict: 'hard_regression',
prPassRate,
baselinePassRate,
delta,
pValueLeft,
pValueRight,
};
}
// Confident drop, but on a baseline too flaky to call a regression.
// Surface as `unreliable_baseline` so it's visible without being a flag.
if (
pValueLeft < hard.maxPValue &&
delta <= -hard.minDelta &&
baselinePassRate < hard.minBaselinePassRate
) {
return {
verdict: 'unreliable_baseline',
prPassRate,
baselinePassRate,
delta,
pValueLeft,
pValueRight,
};
}
// Soft regression — passes the looser gates
if (meetsThreshold(pValueLeft, delta, baselinePassRate, soft, 'worse')) {
return {
verdict: 'soft_regression',
prPassRate,
baselinePassRate,
delta,
pValueLeft,
pValueRight,
};
}
// Watch — meaningful movement but no flag fired. Pure visibility.
if (Math.abs(delta) >= watchDelta) {
return { verdict: 'watch', prPassRate, baselinePassRate, delta, pValueLeft, pValueRight };
}
return { verdict: 'stable', prPassRate, baselinePassRate, delta, pValueLeft, pValueRight };
}
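The tier walk can be condensed into a hypothetical helper (not part of the module) that takes precomputed p-values and rates and applies the same default thresholds in the same order — improvement, hard, unreliable baseline, soft, watch, stable:

```typescript
// Illustrative decision-order sketch using the default thresholds.
type Verdict =
  | 'improvement'
  | 'hard_regression'
  | 'unreliable_baseline'
  | 'soft_regression'
  | 'watch'
  | 'stable';

function sketchVerdict(pLeft: number, pRight: number, delta: number, baseRate: number): Verdict {
  const hard = { maxPValue: 0.05, minDelta: 0.3, minBaselinePassRate: 0.7 };
  const soft = { maxPValue: 0.2, minDelta: 0.15, minBaselinePassRate: 0.5 };
  // Improvements use only the hard significance + delta gates (no reliability gate).
  if (pRight < hard.maxPValue && delta >= hard.minDelta) return 'improvement';
  // Confident drops split on baseline reliability.
  if (pLeft < hard.maxPValue && delta <= -hard.minDelta) {
    return baseRate >= hard.minBaselinePassRate ? 'hard_regression' : 'unreliable_baseline';
  }
  if (pLeft < soft.maxPValue && delta <= -soft.minDelta && baseRate >= soft.minBaselinePassRate)
    return 'soft_regression';
  if (Math.abs(delta) >= 0.35) return 'watch';
  return 'stable';
}

// A large, confident drop on a solid baseline lands in the hard tier.
const v = sketchVerdict(0.01, 0.99, -0.6, 0.9);
```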

View File

@@ -28,7 +28,7 @@ import type {
// Constants
// ---------------------------------------------------------------------------
const DEFAULT_TIMEOUT_MS = 600_000;
const DEFAULT_TIMEOUT_MS = 900_000;
const SSE_SETTLE_DELAY_MS = 200;
const POLL_INTERVAL_MS = 500;
const BACKGROUND_TASK_POLL_INTERVAL_MS = 2_000;

View File

@@ -39,3 +39,38 @@ export type {
ChecklistItem,
ChecklistResult,
} from './types';
// -- Comparison (regression detection) --
export {
compareBuckets,
byVerdict,
improvements,
hardRegressions,
softRegressions,
watchList,
} from './comparison/compare';
export type {
ComparisonResult,
ScenarioComparison,
ScenarioCounts,
ExperimentBucket,
AggregateComparison,
FailureCategoryComparison,
} from './comparison/compare';
export {
classifyScenario,
fishersExactOneSidedLeft,
wilsonInterval,
} from './comparison/statistics';
export type {
ScenarioVerdict,
ScenarioClassification,
ClassifyOptions,
TierThresholds,
} from './comparison/statistics';
export { formatComparisonMarkdown, formatComparisonTerminal } from './comparison/format';
export {
fetchBaselineBucket,
findLatestBaseline,
BASELINE_EXPERIMENT_PREFIX,
} from './comparison/fetch-baseline';

View File

@@ -0,0 +1,247 @@
import { getComputerUsePrompt } from '../computer-use-prompt';
describe('getComputerUsePrompt', () => {
describe('when localGateway is undefined', () => {
it('returns an empty string', () => {
expect(getComputerUsePrompt({ browserAvailable: undefined, localGateway: undefined })).toBe(
'',
);
});
});
describe('when Computer Use is disabled globally', () => {
it('returns an empty string', () => {
expect(
getComputerUsePrompt({
browserAvailable: undefined,
localGateway: { status: 'disabledGlobally' },
}),
).toBe('');
});
});
describe('when Computer Use has not been set up (disabled)', () => {
it('includes the Computer Use intro section', () => {
const result = getComputerUsePrompt({
browserAvailable: undefined,
localGateway: { status: 'disabled' },
});
expect(result).toContain('## Computer Use');
});
it('tells the agent not to use Computer Use tools', () => {
const result = getComputerUsePrompt({
browserAvailable: undefined,
localGateway: { status: 'disabled' },
});
expect(result).toContain('Do NOT attempt to use Computer Use tools');
});
it('provides UI setup instructions', () => {
const result = getComputerUsePrompt({
browserAvailable: undefined,
localGateway: { status: 'disabled' },
});
expect(result).toContain('Setup computer use');
});
});
describe('when Computer Use is disconnected', () => {
it('includes the Computer Use intro section', () => {
const result = getComputerUsePrompt({
browserAvailable: undefined,
localGateway: { status: 'disconnected' },
});
expect(result).toContain('## Computer Use');
});
it('tells the agent not to use Computer Use tools', () => {
const result = getComputerUsePrompt({
browserAvailable: undefined,
localGateway: { status: 'disconnected' },
});
expect(result).toContain('Do NOT attempt to use Computer Use tools');
});
it('provides UI connection instructions', () => {
const result = getComputerUsePrompt({
browserAvailable: undefined,
localGateway: { status: 'disconnected' },
});
expect(result).toContain('"Connect"');
});
});
describe('when Computer Use is connected with no capabilities enabled', () => {
it('reports that no capabilities are enabled', () => {
const result = getComputerUsePrompt({
browserAvailable: undefined,
localGateway: { status: 'connected', capabilities: [] },
});
expect(result).toContain('did not enable any capabilities');
});
it('does not include the filesystem exploration section', () => {
const result = getComputerUsePrompt({
browserAvailable: undefined,
localGateway: { status: 'connected', capabilities: [] },
});
expect(result).not.toContain('Filesystem Exploration');
});
});
describe('when Computer Use is connected with filesystem capability', () => {
it('includes the filesystem exploration guidance', () => {
const result = getComputerUsePrompt({
browserAvailable: undefined,
localGateway: { status: 'connected', capabilities: ['filesystem'] },
});
expect(result).toContain('### Computer Use - Filesystem Exploration');
expect(result).toContain('start at depth 1');
expect(result).toContain('prefer `search` over browsing');
expect(result).toContain('read specific files rather than whole directories');
});
});
describe('when Computer Use is connected without filesystem capability', () => {
it('does not include the filesystem exploration section', () => {
const result = getComputerUsePrompt({
browserAvailable: true,
localGateway: { status: 'connected', capabilities: ['browser'] },
});
expect(result).not.toContain('Filesystem Exploration');
});
});
describe('when Computer Use is connected with browser available', () => {
it('includes the browser automation rules', () => {
const result = getComputerUsePrompt({
browserAvailable: true,
localGateway: { status: 'connected', capabilities: ['browser'] },
});
expect(result).toContain('### Computer Use - Browser Automation rules');
});
it('includes handoff instructions', () => {
const result = getComputerUsePrompt({
browserAvailable: true,
localGateway: { status: 'connected', capabilities: ['browser'] },
});
expect(result).toContain('end your turn');
expect(result).toContain('Authentication');
expect(result).toContain('CAPTCHAs');
});
it('includes the secrets guardrail', () => {
const result = getComputerUsePrompt({
browserAvailable: true,
localGateway: { status: 'connected', capabilities: ['browser'] },
});
expect(result).toContain('NEVER include passwords, API keys');
});
});
describe('when Computer Use is connected but browser is not available', () => {
it('includes the browser-disabled notice', () => {
const result = getComputerUsePrompt({
browserAvailable: false,
localGateway: { status: 'connected', capabilities: ['browser'] },
});
expect(result).toContain('Browser Automation (Disabled in Computer Use)');
});
it('does not include the full browser automation rules', () => {
const result = getComputerUsePrompt({
browserAvailable: false,
localGateway: { status: 'connected', capabilities: ['browser'] },
});
expect(result).not.toContain('end your turn');
});
});
describe('when Computer Use is connected with both filesystem and browser', () => {
it('includes both the filesystem exploration section and browser rules', () => {
const result = getComputerUsePrompt({
browserAvailable: true,
localGateway: { status: 'connected', capabilities: ['filesystem', 'browser'] },
});
expect(result).toContain('Filesystem Exploration');
expect(result).toContain('Browser Automation rules');
});
});
describe('proactive suggestion guidance', () => {
it('is included for a connected gateway', () => {
const result = getComputerUsePrompt({
browserAvailable: true,
localGateway: { status: 'connected', capabilities: ['browser'] },
});
expect(result).toContain('When to suggest or use Computer Use');
});
it('is included for a disconnected gateway', () => {
const result = getComputerUsePrompt({
browserAvailable: undefined,
localGateway: { status: 'disconnected' },
});
expect(result).toContain('When to suggest or use Computer Use');
});
it('is included for a disabled (not set up) gateway', () => {
const result = getComputerUsePrompt({
browserAvailable: undefined,
localGateway: { status: 'disabled' },
});
expect(result).toContain('When to suggest or use Computer Use');
});
it('is absent when localGateway is undefined', () => {
const result = getComputerUsePrompt({ browserAvailable: undefined, localGateway: undefined });
expect(result).not.toContain('When to suggest or use Computer Use');
});
it('is absent when Computer Use is disabled globally', () => {
const result = getComputerUsePrompt({
browserAvailable: undefined,
localGateway: { status: 'disabledGlobally' },
});
expect(result).not.toContain('When to suggest or use Computer Use');
});
it('lists all 7 use-case categories', () => {
const result = getComputerUsePrompt({
browserAvailable: undefined,
localGateway: { status: 'disconnected' },
});
expect(result).toContain('Credential / OAuth setup');
expect(result).toContain('Local file as context');
expect(result).toContain('Documentation / output to files');
expect(result).toContain('Authenticated web research');
expect(result).toContain('Form / frontend testing');
expect(result).toContain('Shell / environment');
expect(result).toContain('Platform migration');
});
});
});

View File

@@ -21,12 +21,6 @@ jest.mock('@mastra/core/processors', () => ({
}),
}));
jest.mock('@mastra/mcp', () => ({
MCPClient: jest.fn().mockImplementation(() => ({
listTools: jest.fn().mockResolvedValue({}),
})),
}));
jest.mock('../../memory/memory-config', () => ({
createMemory: jest.fn().mockReturnValue({}),
}));
@@ -53,10 +47,6 @@ jest.mock('../../tracing/langsmith-tracing', () => ({
mergeTraceRunInputs: jest.fn(),
}));
jest.mock('../sanitize-mcp-schemas', () => ({
sanitizeMcpToolSchemas: jest.fn((tools: Record<string, unknown>) => tools),
}));
jest.mock('../system-prompt', () => ({
getSystemPrompt: jest.fn().mockReturnValue('system prompt'),
}));
@@ -73,12 +63,21 @@ const { Agent } =
// eslint-disable-next-line @typescript-eslint/no-require-imports
require('@mastra/core/agent') as { Agent: jest.Mock };
function createMcpManagerStub() {
return {
getRegularTools: jest.fn().mockResolvedValue({}),
getBrowserTools: jest.fn().mockResolvedValue({}),
disconnect: jest.fn().mockResolvedValue(undefined),
};
}
describe('createInstanceAgent', () => {
it('creates a fresh deferred tool processor for each run-scoped toolset', async () => {
const memoryConfig = {
storage: { id: 'memory-store' },
} as never;
const mcpManager = createMcpManagerStub();
const createOptions = (runId: string) =>
({
modelId: 'test-model',
@@ -93,6 +92,7 @@ describe('createInstanceAgent', () => {
browserMcpConfig: undefined,
},
memoryConfig,
mcpManager,
}) as never;
await createInstanceAgent(createOptions('run-1'));
@@ -129,6 +129,7 @@ describe('createInstanceAgent', () => {
workspace: fakeWorkspace,
},
memoryConfig,
mcpManager: createMcpManagerStub(),
// Exercise the deprecated field to confirm it is ignored.
workspace: fakeWorkspace,
} as never);

View File

@@ -208,4 +208,37 @@ describe('getSystemPrompt', () => {
expect(prompt).toContain('With a single candidate, auto-apply and do not ask');
});
});
describe('trigger URL patterns', () => {
const webhookBaseUrl = 'http://localhost:5678/webhook';
const formBaseUrl = 'http://localhost:5678/form';
it('serves Form Trigger URLs under the /form base, not /webhook', () => {
const prompt = getSystemPrompt({ webhookBaseUrl, formBaseUrl });
expect(prompt).toContain('**Form Trigger**: http://localhost:5678/form/{path}');
expect(prompt).toContain('http://localhost:5678/form/{webhookId}');
expect(prompt).not.toContain('**Form Trigger**: http://localhost:5678/webhook/');
});
it('keeps Webhook Trigger and Chat Trigger on the webhook base URL', () => {
const prompt = getSystemPrompt({ webhookBaseUrl, formBaseUrl });
expect(prompt).toContain('**Webhook Trigger**: http://localhost:5678/webhook/{path}');
expect(prompt).toContain('**Chat Trigger**: http://localhost:5678/webhook/{webhookId}/chat');
});
it('explicitly warns that /form and /webhook are distinct prefixes', () => {
const prompt = getSystemPrompt({ webhookBaseUrl, formBaseUrl });
expect(prompt).toMatch(/Form Trigger lives under \/form\/, NOT \/webhook\//);
expect(prompt).toContain('Do NOT use the Webhook base URL for Form Triggers');
});
it('omits the Instance Info section when base URLs are not provided', () => {
const prompt = getSystemPrompt({});
expect(prompt).not.toContain('## Instance Info');
});
});
});

View File

@@ -0,0 +1,115 @@
import { type LocalGatewayStatus } from '@/types';
const BROWSER_USE_EXTENSION_URL =
'https://chromewebstore.google.com/detail/n8n-browser-use/cegmdpndekdfpnafgacidejijecomlhh';
export function getComputerUsePrompt({
browserAvailable,
localGateway,
}: {
browserAvailable: boolean | undefined;
localGateway: LocalGatewayStatus | undefined;
}) {
if (localGateway && localGateway.status !== 'disabledGlobally') {
const promptParts: string[] = [];
promptParts.push(`
## Computer Use
This instance supports "Computer Use", which allows connecting to the user's computer and executing the following functionality:
- *filesystem* - read and write files. Use it when users want to include their own files in the automation.
- *shell* - execute shell commands. Use it when you need or are asked to execute commands on the user's computer
- *browser* - automate the user's browser to access web pages and do tasks on the user's behalf. Use it when you require access to the user's browser session, for example when creating credentials with the user's accounts. Requires installing the "n8n Browser Use" Chrome extension from the Chrome Web Store: ${BROWSER_USE_EXTENSION_URL}
- *screenshot*, *mouse-keyboard* - control the user's mouse and keyboard and take screenshots (do not advertise or use this functionality if the user does not explicitly ask for it)
Users have control over this functionality and can enable only the tools they want to provide.
Users can reconnect Computer Use with a different set of capabilities, so always rely on the current Computer Use status and the available tools, not on the conversation history.
### When to suggest or use Computer Use
Proactively suggest Computer Use (or use it directly if connected) when you detect these signals:
- **Credential / OAuth setup**: user needs to set up, create, configure, or connect credentials for any service that requires OAuth or API key generation through a web portal (Slack, Google, Microsoft, HubSpot, Notion, Stripe, Twilio, etc.) → *browser*
- **Local file as context**: user mentions a file, PDF, CSV, spec, or requirements doc they want to use as reference while building a workflow → *filesystem*
- **Documentation / output to files**: user asks to document, write up, export, or save a workflow description, runbook, or handover doc → *filesystem*
- **Authenticated web research**: user wants to check something on a site they're logged into, or gather data from a web-based tool → *browser*
- **Form / frontend testing**: user is building n8n forms or a web app with n8n as backend and wants end-to-end testing → *browser*
- **Shell / environment**: user asks to run a command (curl, CLI, DB query), automate something locally, or debug connectivity → *shell*
- **Platform migration**: user wants to migrate from Make, Zapier, or another automation platform, or replicate an existing workflow from it → *browser* + *filesystem*
`);
promptParts.push(`
### Computer Use status`);
switch (localGateway.status) {
case 'connected':
if (localGateway.capabilities.length > 0) {
promptParts.push(
`Computer Use is connected; the user has enabled the following capabilities: ${localGateway.capabilities.join(',')}`,
);
if (localGateway.capabilities.includes('filesystem')) {
promptParts.push(`
### Computer Use - Filesystem Exploration
Keep exploration shallow: start at depth 1-2, prefer \`search\` over browsing, and read specific files rather than whole directories.`);
}
if (browserAvailable) {
promptParts.push(`
### Computer Use - Browser Automation rules
You can control the user's browser using the browser_* tools. Since this is their real browser, you share it with them.
#### Handing control to the user
When the user needs to act in the browser, **end your turn** with a clear message explaining what they should do. Resume after they reply. Hand off when:
- **Authentication**: login pages, OAuth, SSO, 2FA/MFA prompts
- **CAPTCHAs or visual challenges**: you cannot solve these
- **Accessing downloads**: you can click download buttons, but you cannot open or read downloaded files; ask the user to open the file and share the content you need
- **Sensitive content on screen**: passwords, tokens, secrets visible in the browser
- **User requests manual control**: they explicitly want to do something themselves
After the user confirms they're done, take a snapshot to verify before continuing.
#### Secrets and sensitive data
**NEVER include passwords, API keys, tokens, or secrets in your chat messages**, even if they are visible on a page. If the user asks you to retrieve a secret, tell them to read it directly from their browser.
#### When browser tools fail at runtime
If a browser_* tool call fails because the browser is unreachable (e.g. connection lost, extension not responding), ask the user to verify the **n8n Browser Use** Chrome extension is installed and connected. If needed, they can reinstall from the Chrome Web Store: ${BROWSER_USE_EXTENSION_URL}`);
} else {
promptParts.push(`
### Browser Automation (Disabled in Computer Use)
Browser tools are not enabled in the user's Computer Use configuration. If the user asks for browser automation, tell them to (1) enable browser tools in their Computer Use config, and (2) install the n8n Browser Use Chrome extension from the Chrome Web Store: ${BROWSER_USE_EXTENSION_URL}`);
}
} else {
promptParts.push(
'Computer Use is connected, but the user did not enable any capabilities',
);
}
break;
case 'disconnected':
promptParts.push(
`Computer Use is not connected. Do NOT attempt to use Computer Use tools — they are not available. You can provide these instructions to establish a connection:
1. open the right sidebar
2. click on the "..." button next to "Computer Use"
3. click on "Connect" and follow the instructions in the dialog`,
);
break;
case 'disabled':
promptParts.push(
`Computer Use is not connected and not set up. Do NOT attempt to use Computer Use tools — they are not available. You can provide these instructions to establish a connection:
1. open the right sidebar
2. click on "Setup computer use"
3. follow the instructions in the dialog`,
);
break;
default:
}
return promptParts.join('\n');
}
return '';
}

View File

@@ -3,43 +3,13 @@ import { Agent } from '@mastra/core/agent';
import { Mastra } from '@mastra/core/mastra';
import { ToolSearchProcessor, type ToolSearchProcessorOptions } from '@mastra/core/processors';
import type { MastraCompositeStore } from '@mastra/core/storage';
import { MCPClient } from '@mastra/mcp';
import { nanoid } from 'nanoid';
import { createMemory } from '../memory/memory-config';
import { createAllTools, createOrchestratorDomainTools, createOrchestrationTools } from '../tools';
import { sanitizeMcpToolSchemas } from './sanitize-mcp-schemas';
import { getSystemPrompt } from './system-prompt';
import { createToolsFromLocalMcpServer } from '../tools/filesystem/create-tools-from-mcp-server';
import { buildAgentTraceInputs, mergeTraceRunInputs } from '../tracing/langsmith-tracing';
import type { CreateInstanceAgentOptions, McpServerConfig } from '../types';
function buildMcpServers(
configs: McpServerConfig[],
): Record<
string,
{ url: URL } | { command: string; args?: string[]; env?: Record<string, string> }
> {
const servers: Record<
string,
{ url: URL } | { command: string; args?: string[]; env?: Record<string, string> }
> = {};
for (const server of configs) {
if (server.url) {
servers[server.name] = { url: new URL(server.url) };
} else if (server.command) {
servers[server.name] = { command: server.command, args: server.args, env: server.env };
}
}
return servers;
}
// ── Cached MCP tools (expensive to initialize — spawn processes, connect, list) ──
let cachedMcpTools: ToolsInput | null = null;
let cachedMcpServersKey = '';
let cachedBrowserMcpTools: ToolsInput | null = null;
let cachedBrowserMcpKey = '';
import type { CreateInstanceAgentOptions } from '../types';
let cachedMastra: Mastra | null = null;
let cachedMastraStorageKey = '';
@@ -58,40 +28,6 @@ function getOrCreateToolSearchProcessor(tools: ToolsInput): ToolSearchProcessor
});
}
async function getMcpTools(mcpServers: McpServerConfig[]): Promise<ToolsInput> {
const key = JSON.stringify(mcpServers);
if (cachedMcpTools && cachedMcpServersKey === key) return cachedMcpTools;
if (mcpServers.length === 0) {
cachedMcpTools = {};
cachedMcpServersKey = key;
return cachedMcpTools;
}
const mcpClient = new MCPClient({
id: `mcp-${nanoid(6)}`,
servers: buildMcpServers(mcpServers),
});
cachedMcpTools = sanitizeMcpToolSchemas(await mcpClient.listTools());
cachedMcpServersKey = key;
return cachedMcpTools;
}
async function getBrowserMcpTools(config: McpServerConfig | undefined): Promise<ToolsInput> {
if (!config) return {};
const key = JSON.stringify(config);
if (cachedBrowserMcpTools && cachedBrowserMcpKey === key) return cachedBrowserMcpTools;
const browserClient = new MCPClient({
id: `browser-mcp-${nanoid(6)}`,
servers: buildMcpServers([config]),
});
cachedBrowserMcpTools = sanitizeMcpToolSchemas(await browserClient.listTools());
cachedBrowserMcpKey = key;
return cachedBrowserMcpTools;
}
function ensureMastraRegistered(agent: Agent, storage: MastraCompositeStore): void {
const key = storage.id ?? 'default';
if (!cachedMastra || cachedMastraStorageKey !== key) {
@@ -112,6 +48,7 @@ export async function createInstanceAgent(options: CreateInstanceAgentOptions):
context,
orchestrationContext,
mcpServers = [],
mcpManager,
memoryConfig,
disableDeferredTools = false,
} = options;
@@ -121,9 +58,10 @@ export async function createInstanceAgent(options: CreateInstanceAgentOptions):
const orchestratorDomainTools = createOrchestratorDomainTools(context);
// Load MCP tools (cached — only spawns processes on first call or config change)
const mcpTools = await getMcpTools(mcpServers);
const browserMcpTools = await getBrowserMcpTools(orchestrationContext?.browserMcpConfig);
// Load MCP tools (cached by config-hash inside the manager — only spawns
// processes / opens connections on first call or config change).
const mcpTools = await mcpManager.getRegularTools(mcpServers);
const browserMcpTools = await mcpManager.getBrowserTools(orchestrationContext?.browserMcpConfig);
// Browser tool names — used to exclude them from the orchestrator's direct toolset.
// Browser tools are only accessible via browser-credential-setup (sub-agent) to prevent
@@ -208,7 +146,7 @@ export async function createInstanceAgent(options: CreateInstanceAgentOptions):
const systemPrompt = getSystemPrompt({
researchMode: orchestrationContext?.researchMode,
webhookBaseUrl: orchestrationContext?.webhookBaseUrl,
filesystemAccess: (context.localMcpServer?.getToolsByCategory('filesystem').length ?? 0) > 0,
formBaseUrl: orchestrationContext?.formBaseUrl,
localGateway: context.localGatewayStatus,
toolSearchEnabled: hasDeferrableTools,
licenseHints: context.licenseHints,

View File

@@ -1,16 +1,14 @@
import { DateTime } from 'luxon';
import { getComputerUsePrompt } from './computer-use-prompt';
import { SECRET_ASK_GUARDRAIL } from './credential-guardrails.prompt';
import { UNTRUSTED_CONTENT_DOCTRINE } from './shared-prompts';
import type { LocalGatewayStatus } from '../types';
const BROWSER_USE_EXTENSION_URL =
'https://chromewebstore.google.com/detail/n8n-browser-use/cegmdpndekdfpnafgacidejijecomlhh';
interface SystemPromptOptions {
researchMode?: boolean;
webhookBaseUrl?: string;
filesystemAccess?: boolean;
formBaseUrl?: string;
localGateway?: LocalGatewayStatus;
toolSearchEnabled?: boolean;
/** Human-readable hints about licensed features that are NOT available on this instance. */
@@ -33,119 +31,22 @@ The user's current local date and time is: ${isoTime}${tzLabel}.
When you need to reference "now", use this date and time.`;
}
function getInstanceInfoSection(webhookBaseUrl: string): string {
function getInstanceInfoSection(webhookBaseUrl: string, formBaseUrl: string): string {
return `
## Instance Info
Webhook base URL: ${webhookBaseUrl}
Form base URL: ${formBaseUrl}
Some trigger nodes expose HTTP endpoints. Always share the full production URL with the user after building a workflow that uses one of these triggers. Each type has a distinct URL pattern:
- **Webhook Trigger**: ${webhookBaseUrl}/{path} (where {path} is the node's webhook path parameter).
- **Form Trigger**: ${webhookBaseUrl}/{path} (or ${webhookBaseUrl}/{webhookId} if no custom path is set). Same pattern as Webhook no /chat suffix.
- **Form Trigger**: ${formBaseUrl}/{path} (or ${formBaseUrl}/{webhookId} if no custom path is set). The Form Trigger lives under /form/, NOT /webhook/ they are separate URL prefixes. Do NOT use the Webhook base URL for Form Triggers.
- **Chat Trigger**: ${webhookBaseUrl}/{webhookId}/chat (where {webhookId} is the node's unique webhook ID, visible in the workflow JSON). The /chat suffix is unique to Chat Trigger — do NOT append it to Form Trigger or Webhook URLs. The public chat UI is only accessible to end users when the node's "public" parameter is true and the workflow has been published. (This applies only to end-user HTTP access your own testing via \`executions(action="run")\` and \`verify-built-workflow\` works regardless of publish state.) Do NOT guess the webhookId — read the workflow to find it.
**These URLs are for sharing with the user only.** Do NOT include them in \`build-workflow-with-agent\` task descriptions — the builder cannot reach the n8n instance via HTTP and will fail if it tries to curl/fetch these URLs.`;
}
function getFilesystemSection(
filesystemAccess: boolean | undefined,
localGateway: LocalGatewayStatus | undefined,
webhookBaseUrl?: string,
): string {
// When gateway status is explicitly provided, use multi-way logic
if (localGateway?.status === 'disconnected') {
const capabilityLines: string[] = [];
if (localGateway.capabilities.includes('filesystem')) {
capabilityLines.push('- **Filesystem access** — browse, read, and search project files');
}
if (localGateway.capabilities.includes('browser')) {
capabilityLines.push(
"- **Browser control** — automate browser interactions on the user's machine",
);
}
const capList =
capabilityLines.length > 0
? capabilityLines.join('\n')
: '- Local machine access capabilities';
const instanceUrl = webhookBaseUrl ? new URL(webhookBaseUrl).origin : '<your-instance-url>';
return `
## Computer Use (Not Connected)
A **Computer Use** gateway can connect this n8n instance to the user's local machine, providing:
${capList}
The gateway is not currently connected. When the user asks for something that requires local machine access (reading files, browsing, etc.), let them know they can connect by either:
1. **Run via CLI:** \`npx @n8n/computer-use ${instanceUrl}\`
Do NOT attempt to use Computer Use tools; they are not available until the gateway connects.`;
}
if (filesystemAccess) {
return `
## Project Filesystem Access
You have read-only access to the user's project files via the \`filesystem\` tool with actions: \`tree\`, \`search\`, \`read\`, \`list\`. Explore the project before building workflows that depend on user data shapes.
Keep exploration shallow: start at depth 1-2, prefer \`search\` over browsing, and read specific files, not whole directories.`;
}
return `
## No Filesystem Access
You do NOT have access to the user's project files. The filesystem tool is not available. Do not attempt to use it or claim you can browse the user's codebase.`;
}
function getBrowserSection(
browserAvailable: boolean | undefined,
localGateway: LocalGatewayStatus | undefined,
): string {
if (!browserAvailable) {
if (localGateway?.status === 'disconnected') {
return `
## Browser Automation (Unavailable)
Browser tools require both the Computer Use daemon (see above) **and** the n8n Browser Use Chrome extension. If the user asks for browser automation, tell them to start the daemon and install the extension from the Chrome Web Store: ${BROWSER_USE_EXTENSION_URL}`;
}
if (localGateway?.status === 'connected') {
return `
## Browser Automation (Disabled in Computer Use)
Browser tools are not enabled in the user's Computer Use configuration. If the user asks for browser automation, tell them to (1) enable browser tools in their Computer Use config, and (2) install the n8n Browser Use Chrome extension from the Chrome Web Store: ${BROWSER_USE_EXTENSION_URL}`;
}
return '';
}
return `
## Browser Automation
You can control the user's browser using the browser_* tools. Since this is their real browser, you share it with them.
### Handing control to the user
When the user needs to act in the browser, **end your turn** with a clear message explaining what they should do. Resume after they reply. Hand off when:
- **Authentication**: login pages, OAuth, SSO, 2FA/MFA prompts
- **CAPTCHAs or visual challenges**: you cannot solve these
- **Accessing downloads**: you can click download buttons, but you cannot open or read downloaded files; ask the user to open the file and share the content you need
- **Sensitive content on screen**: passwords, tokens, secrets visible in the browser
- **User requests manual control**: they explicitly want to do something themselves
After the user confirms they're done, take a snapshot to verify before continuing.
### Secrets and sensitive data
**NEVER include passwords, API keys, tokens, or secrets in your chat messages**, even if visible on a page. If the user asks you to retrieve a secret, tell them to read it directly from their browser.
### When browser tools fail at runtime
If a browser_* tool call fails because the browser is unreachable (e.g. connection lost, extension not responding), ask the user to verify the **n8n Browser Use** Chrome extension is installed and connected. If needed, they can reinstall from the Chrome Web Store: ${BROWSER_USE_EXTENSION_URL}`;
}
function getReadOnlySection(branchReadOnly?: boolean): string {
if (!branchReadOnly) return '';
return `
@ -172,7 +73,7 @@ export function getSystemPrompt(options: SystemPromptOptions = {}): string {
const {
researchMode,
webhookBaseUrl,
filesystemAccess,
formBaseUrl,
localGateway,
toolSearchEnabled,
licenseHints,
@ -183,7 +84,7 @@ export function getSystemPrompt(options: SystemPromptOptions = {}): string {
return `You are the n8n Instance Agent — an AI assistant embedded in an n8n instance. You help users build, run, debug, and manage workflows through natural language.
${getDateTimeSection(timeZone)}
${webhookBaseUrl ? getInstanceInfoSection(webhookBaseUrl) : ''}
${webhookBaseUrl && formBaseUrl ? getInstanceInfoSection(webhookBaseUrl, formBaseUrl) : ''}
You have access to workflow, execution, and credential tools plus a specialized workflow builder. You also have delegation capabilities for complex tasks, and may have access to MCP tools for extended capabilities.
@ -283,8 +184,7 @@ You have the \`research\` tool with \`web-search\` and \`fetch-url\` actions. Us
}
${UNTRUSTED_CONTENT_DOCTRINE}
${getFilesystemSection(filesystemAccess, localGateway, webhookBaseUrl)}
${getBrowserSection(browserAvailable, localGateway)}
${getComputerUsePrompt({ browserAvailable, localGateway })}
${
licenseHints && licenseHints.length > 0


@ -44,6 +44,7 @@ export {
MastraIterationLogStorage,
MastraTaskStorage,
PlannedTaskStorage,
TerminalOutcomeStorage,
patchThread,
WorkflowLoopStorage,
} from './storage';
@ -53,6 +54,7 @@ export type {
IterationLog,
PatchableThreadMemory,
ThreadPatch,
TerminalOutcome,
WorkflowLoopWorkItemRecord,
} from './storage';
export { truncateToTitle, generateTitleForRun } from './memory/title-utils';
@ -90,6 +92,12 @@ export type {
StartedRunState,
SuspendedRunState,
} from './runtime/run-state-registry';
export { InstanceAiTerminalResponseGuard } from './runtime/terminal-response-guard';
export type {
TerminalResponseDecision,
TerminalResponseStatus,
TerminalVisibilitySource,
} from './runtime/terminal-response-guard';
export { executeResumableStream } from './runtime/resumable-stream-executor';
export type {
AutoResumeControl,
@ -100,6 +108,7 @@ export type {
ResumableStreamControl,
ResumableStreamSource,
} from './runtime/resumable-stream-executor';
export type { WorkSummary } from './stream/work-summary-accumulator';
export { resumeAgentRun, streamAgentRun } from './runtime/stream-runner';
export type {
StreamableAgent,


@ -0,0 +1,252 @@
jest.mock('@mastra/mcp', () => ({
MCPClient: jest.fn().mockImplementation(() => ({
listTools: jest.fn().mockResolvedValue({}),
disconnect: jest.fn().mockResolvedValue(undefined),
})),
}));
jest.mock('../../agent/sanitize-mcp-schemas', () => ({
sanitizeMcpToolSchemas: jest.fn((tools: Record<string, unknown>) => tools),
}));
import { createResultError, createResultOk, UserError } from 'n8n-workflow';
import type { SsrfUrlValidator } from '../mcp-client-manager';
import { McpClientManager } from '../mcp-client-manager';
const { MCPClient: mockedMcpClient } =
// eslint-disable-next-line @typescript-eslint/no-require-imports
require('@mastra/mcp') as { MCPClient: jest.Mock };
function createValidatorMock(): jest.Mocked<SsrfUrlValidator> {
return {
validateUrl: jest.fn().mockResolvedValue(createResultOk(undefined)),
} as jest.Mocked<SsrfUrlValidator>;
}
describe('McpClientManager', () => {
beforeEach(() => {
jest.clearAllMocks();
});
describe('protocol whitelist (always-on)', () => {
it('accepts https URLs', async () => {
const manager = new McpClientManager();
await expect(
manager.getRegularTools([{ name: 'github', url: 'https://api.github.com/mcp' }]),
).resolves.toBeDefined();
expect(mockedMcpClient).toHaveBeenCalledTimes(1);
});
it('accepts http URLs', async () => {
const manager = new McpClientManager();
await expect(
manager.getRegularTools([{ name: 'local', url: 'http://localhost:3000/sse' }]),
).resolves.toBeDefined();
});
it('rejects file:// URLs with a UserError naming the server', async () => {
const manager = new McpClientManager();
await expect(
manager.getRegularTools([{ name: 'sneaky', url: 'file:///etc/passwd' }]),
).rejects.toThrow(UserError);
await expect(
manager.getRegularTools([{ name: 'sneaky', url: 'file:///etc/passwd' }]),
).rejects.toThrow(/MCP server "sneaky".*file:/);
expect(mockedMcpClient).not.toHaveBeenCalled();
});
it('rejects ws:// URLs', async () => {
const manager = new McpClientManager();
await expect(
manager.getRegularTools([{ name: 'sock', url: 'ws://example.com/' }]),
).rejects.toThrow(/only http\(s\) URLs are allowed/);
expect(mockedMcpClient).not.toHaveBeenCalled();
});
it('rejects malformed URLs', async () => {
const manager = new McpClientManager();
await expect(manager.getRegularTools([{ name: 'broken', url: 'not a url' }])).rejects.toThrow(
/invalid URL/,
);
expect(mockedMcpClient).not.toHaveBeenCalled();
});
it('skips URL validation for stdio configs', async () => {
const manager = new McpClientManager();
await expect(
manager.getRegularTools([
{ name: 'local-stdio', command: '/usr/bin/mcp-server', args: ['--port', '3000'] },
]),
).resolves.toBeDefined();
expect(mockedMcpClient).toHaveBeenCalledTimes(1);
});
});
describe('SSRF policy (opt-in)', () => {
it('does not call validateUrl when no validator is supplied', async () => {
const manager = new McpClientManager();
await expect(
manager.getRegularTools([{ name: 'public', url: 'https://api.example.com/mcp' }]),
).resolves.toBeDefined();
// No validator was supplied, so there is no spy to assert against;
// resolving without throwing is the observable behavior.
});
it('calls validateUrl for every configured URL when a validator is supplied', async () => {
const validator = createValidatorMock();
const manager = new McpClientManager(validator);
await manager.getRegularTools([
{ name: 'a', url: 'https://a.example.com/mcp' },
{ name: 'b', url: 'https://b.example.com/mcp' },
]);
expect(validator.validateUrl).toHaveBeenCalledTimes(2);
expect(validator.validateUrl).toHaveBeenCalledWith('https://a.example.com/mcp');
expect(validator.validateUrl).toHaveBeenCalledWith('https://b.example.com/mcp');
});
it('rejects with UserError when validateUrl returns blocked', async () => {
const validator = createValidatorMock();
validator.validateUrl.mockResolvedValue(createResultError(new Error('blocked: 10.0.0.1')));
const manager = new McpClientManager(validator);
await expect(
manager.getRegularTools([{ name: 'internal', url: 'http://10.0.0.1/mcp' }]),
).rejects.toThrow(UserError);
expect(mockedMcpClient).not.toHaveBeenCalled();
});
it('error message names the server and surfaces the policy reason', async () => {
const validator = createValidatorMock();
validator.validateUrl.mockResolvedValue(createResultError(new Error('blocked: 10.0.0.1')));
const manager = new McpClientManager(validator);
await expect(
manager.getRegularTools([{ name: 'internal', url: 'http://10.0.0.1/mcp' }]),
).rejects.toThrow(/MCP server "internal".*blocked: 10\.0\.0\.1/);
});
it('skips SSRF check for stdio configs even when validator is supplied', async () => {
const validator = createValidatorMock();
const manager = new McpClientManager(validator);
await manager.getRegularTools([{ name: 'stdio', command: '/usr/bin/mcp' }]);
expect(validator.validateUrl).not.toHaveBeenCalled();
});
it('applies validation to browser MCP config too', async () => {
const validator = createValidatorMock();
validator.validateUrl.mockResolvedValue(createResultError(new Error('blocked')));
const manager = new McpClientManager(validator);
await expect(
manager.getBrowserTools({ name: 'browser', url: 'http://internal/' }),
).rejects.toThrow(UserError);
});
});
describe('disconnect', () => {
it('disconnects every tracked client and clears caches', async () => {
const manager = new McpClientManager();
await manager.getRegularTools([{ name: 'a', url: 'https://a.example.com/' }]);
await manager.getBrowserTools({ name: 'b', url: 'https://b.example.com/' });
expect(mockedMcpClient).toHaveBeenCalledTimes(2);
const disconnectMocks = mockedMcpClient.mock.results.map(
(r) => (r.value as { disconnect: jest.Mock }).disconnect,
);
await manager.disconnect();
for (const d of disconnectMocks) {
expect(d).toHaveBeenCalledTimes(1);
}
});
});
describe('caching', () => {
it('does not re-list tools for an unchanged config', async () => {
const manager = new McpClientManager();
const configs = [{ name: 'a', url: 'https://a.example.com/' }];
await manager.getRegularTools(configs);
await manager.getRegularTools(configs);
expect(mockedMcpClient).toHaveBeenCalledTimes(1);
});
it('keeps regular and browser caches separate', async () => {
const manager = new McpClientManager();
await manager.getRegularTools([{ name: 'shared', url: 'https://shared.example.com/' }]);
await manager.getBrowserTools({ name: 'shared', url: 'https://shared.example.com/' });
// Same config shape but different bucket → two clients
expect(mockedMcpClient).toHaveBeenCalledTimes(2);
});
});
describe('concurrent dedup', () => {
it('coalesces concurrent regular-tool calls with the same config into one client', async () => {
const manager = new McpClientManager();
const configs = [{ name: 'a', url: 'https://a.example.com/' }];
const [tools1, tools2] = await Promise.all([
manager.getRegularTools(configs),
manager.getRegularTools(configs),
]);
expect(mockedMcpClient).toHaveBeenCalledTimes(1);
expect(tools1).toBe(tools2);
});
it('coalesces concurrent browser-tool calls with the same config into one client', async () => {
const manager = new McpClientManager();
const config = { name: 'browser', url: 'https://browser.example.com/' };
await Promise.all([manager.getBrowserTools(config), manager.getBrowserTools(config)]);
expect(mockedMcpClient).toHaveBeenCalledTimes(1);
});
it('lets the next call retry after an in-flight failure', async () => {
const manager = new McpClientManager();
const configs = [{ name: 'a', url: 'https://a.example.com/' }];
mockedMcpClient.mockImplementationOnce(() => ({
listTools: jest.fn().mockRejectedValue(new Error('boom')),
disconnect: jest.fn().mockResolvedValue(undefined),
}));
await expect(manager.getRegularTools(configs)).rejects.toThrow('boom');
// In-flight entry must be cleared so a retry actually re-attempts.
await expect(manager.getRegularTools(configs)).resolves.toBeDefined();
expect(mockedMcpClient).toHaveBeenCalledTimes(2);
});
});
describe('disconnect interaction with in-flight work', () => {
// Returns a deferred listTools promise we can resolve later, simulating a
// long-running tool listing that's still pending when disconnect() runs.
function deferListTools() {
let resolve: (value: Record<string, unknown>) => void = () => {};
const promise = new Promise<Record<string, unknown>>((r) => {
resolve = r;
});
return { promise, resolve };
}
it('does not coalesce new calls with in-flight work that disconnect severed', async () => {
const manager = new McpClientManager();
const configs = [{ name: 'a', url: 'https://a.example.com/' }];
const deferred = deferListTools();
mockedMcpClient.mockImplementationOnce(() => ({
listTools: jest.fn().mockReturnValue(deferred.promise),
disconnect: jest.fn().mockResolvedValue(undefined),
}));
const stranded = manager.getRegularTools(configs);
// Yield so connectAndListTools registers the client before we tear down.
await Promise.resolve();
await manager.disconnect();
// New call must start a fresh client, not join the stranded promise.
await manager.getRegularTools(configs);
expect(mockedMcpClient).toHaveBeenCalledTimes(2);
// Cleanup: let the stranded promise settle so the test doesn't hang.
deferred.resolve({});
await stranded.catch(() => {});
});
});
});
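A minimal validator satisfying the `SsrfUrlValidator` shape these tests mock might look like the following sketch. The private-range check is deliberately naive, the real `SsrfProtectionService` is far more thorough, and the local `Result` type is a structural stand-in for the one from `n8n-workflow`:

```typescript
// Structural stand-in for n8n-workflow's Result type.
type Result<T, E> = { ok: true; result: T } | { ok: false; error: E };

// Naive illustration: block localhost and RFC 1918 IPv4 hosts by hostname.
const naiveSsrfValidator = {
	async validateUrl(url: string | URL): Promise<Result<void, Error>> {
		const host = new URL(String(url)).hostname;
		const isPrivate =
			host === 'localhost' ||
			/^10\./.test(host) ||
			/^192\.168\./.test(host) ||
			/^172\.(1[6-9]|2\d|3[01])\./.test(host);
		return isPrivate
			? { ok: false, error: new Error(`blocked: ${host}`) }
			: { ok: true, result: undefined };
	},
};
```

Note that hostname checks alone do not stop DNS rebinding; a production validator resolves the host and checks the resulting IP addresses.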


@ -1,38 +1,175 @@
import type { ToolsInput } from '@mastra/core/agent';
import { MCPClient } from '@mastra/mcp';
import type { Result } from 'n8n-workflow';
import { UserError } from 'n8n-workflow';
import { nanoid } from 'nanoid';
import { sanitizeMcpToolSchemas } from '../agent/sanitize-mcp-schemas';
import type { McpServerConfig } from '../types';
export class McpClientManager {
private mcpClient: MCPClient | undefined;
/**
* SSRF policy gate for outbound MCP URLs. The cli's `SsrfProtectionService`
* satisfies this structurally; we keep the local shape narrow to avoid pulling
* `n8n-core` into this package just for one type.
*/
export interface SsrfUrlValidator {
validateUrl(url: string | URL): Promise<Result<void, Error>>;
}
async connect(servers: McpServerConfig[]): Promise<Record<string, unknown>> {
if (servers.length === 0) return {};
type McpServerEntry =
| { url: URL }
| { command: string; args?: string[]; env?: Record<string, string> };
const serverMap: Record<
string,
{ url: URL } | { command: string; args?: string[]; env?: Record<string, string> }
> = {};
for (const server of servers) {
if (server.url) {
serverMap[server.name] = { url: new URL(server.url) };
} else if (server.command) {
serverMap[server.name] = {
command: server.command,
args: server.args,
env: server.env,
};
}
function buildMcpServers(configs: McpServerConfig[]): Record<string, McpServerEntry> {
const servers: Record<string, McpServerEntry> = {};
for (const server of configs) {
if (server.url) {
servers[server.name] = { url: new URL(server.url) };
} else if (server.command) {
servers[server.name] = { command: server.command, args: server.args, env: server.env };
}
}
return servers;
}
this.mcpClient = new MCPClient({ servers: serverMap });
return await this.mcpClient.listTools();
/**
* Owns the lifecycle of MCP client connections used by the orchestrator.
*
* Two buckets:
* - **regular**: external MCP servers configured by the admin. Their tools are
* merged into the orchestrator's toolset.
* - **browser**: Chrome DevTools MCP. Excluded from the orchestrator (context
* bloat from screenshots) and only handed to `browser-credential-setup`
* sub-agents.
*
* Tool listings are cached by config-hash; clients are tracked in a single map
* so `disconnect()` cleans up everything regardless of which bucket created
* them.
*
* URLs are validated before the underlying `MCPClient` is constructed:
* - Protocol whitelist (`http:` / `https:`) is always enforced.
* - SSRF policy is opt-in via `ssrfValidator`. The cli supplies one when
* `N8N_SSRF_PROTECTION_ENABLED` is on, matching how other admin-configured
* outbound URLs (workflow imports, HTTP Request node) handle the same flag.
*/
export class McpClientManager {
private regularToolsByKey = new Map<string, ToolsInput>();
private browserToolsByKey = new Map<string, ToolsInput>();
private inFlightRegularByKey = new Map<string, Promise<ToolsInput>>();
private inFlightBrowserByKey = new Map<string, Promise<ToolsInput>>();
private clientsByKey = new Map<string, MCPClient>();
constructor(private readonly ssrfValidator?: SsrfUrlValidator) {}
async getRegularTools(configs: McpServerConfig[]): Promise<ToolsInput> {
if (configs.length === 0) return {};
const key = JSON.stringify(configs);
return await this.getOrLoad(
this.regularToolsByKey,
this.inFlightRegularByKey,
key,
async () => {
await this.validateConfigs(configs);
return await this.connectAndListTools(`mcp-${nanoid(6)}`, configs, key);
},
);
}
async getBrowserTools(config: McpServerConfig | undefined): Promise<ToolsInput> {
if (!config) return {};
const key = JSON.stringify(config);
return await this.getOrLoad(
this.browserToolsByKey,
this.inFlightBrowserByKey,
key,
async () => {
await this.validateConfigs([config]);
return await this.connectAndListTools(`browser-mcp-${nanoid(6)}`, [config], key);
},
);
}
async disconnect(): Promise<void> {
if (this.mcpClient) {
await this.mcpClient.disconnect();
this.mcpClient = undefined;
const clients = [...this.clientsByKey.values()];
this.clientsByKey.clear();
this.regularToolsByKey.clear();
this.browserToolsByKey.clear();
this.inFlightRegularByKey.clear();
this.inFlightBrowserByKey.clear();
await Promise.all(clients.map(async (c) => await c.disconnect()));
}
/**
* Returns a cached value if present, otherwise dedupes concurrent producers
* by sharing a single in-flight promise per key. Successful results are
* committed to the cache; failures clear the in-flight entry so the next
* call retries from scratch.
*/
private async getOrLoad<T>(
cache: Map<string, T>,
inFlight: Map<string, Promise<T>>,
key: string,
produce: () => Promise<T>,
): Promise<T> {
const cached = cache.get(key);
if (cached) return cached;
const pending = inFlight.get(key);
if (pending) return await pending;
const promise = (async () => {
const value = await produce();
cache.set(key, value);
return value;
})();
inFlight.set(key, promise);
try {
return await promise;
} finally {
inFlight.delete(key);
}
}
private async validateConfigs(configs: McpServerConfig[]): Promise<void> {
for (const server of configs) {
if (!server.url) continue; // stdio transport — no URL to validate
let parsed: URL;
try {
parsed = new URL(server.url);
} catch {
throw new UserError(`MCP server "${server.name}": invalid URL "${server.url}"`);
}
if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
throw new UserError(
`MCP server "${server.name}": only http(s) URLs are allowed, got "${parsed.protocol}"`,
);
}
if (this.ssrfValidator) {
const result = await this.ssrfValidator.validateUrl(server.url);
if (!result.ok) {
throw new UserError(
`MCP server "${server.name}": URL blocked by SSRF policy — ${result.error.message}`,
);
}
}
}
}
private async connectAndListTools(
id: string,
configs: McpServerConfig[],
clientKey: string,
): Promise<ToolsInput> {
const client = new MCPClient({ id, servers: buildMcpServers(configs) });
this.clientsByKey.set(clientKey, client);
return sanitizeMcpToolSchemas(await client.listTools());
}
}
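The cache-plus-in-flight pattern behind `getOrLoad` can be exercised in isolation. This standalone sketch (with a toy producer, not the class above) shows why concurrent callers coalesce into one producer invocation and why failures must clear the in-flight entry:

```typescript
// Standalone sketch mirroring getOrLoad's shape.
async function loadOnce<T>(
	cache: Map<string, T>,
	inFlight: Map<string, Promise<T>>,
	key: string,
	produce: () => Promise<T>,
): Promise<T> {
	const cached = cache.get(key);
	if (cached !== undefined) return cached;
	const pending = inFlight.get(key);
	if (pending) return await pending; // coalesce concurrent callers
	const promise = (async () => {
		const value = await produce();
		cache.set(key, value); // commit only on success
		return value;
	})();
	inFlight.set(key, promise);
	try {
		return await promise;
	} finally {
		inFlight.delete(key); // failures leave no stale entry, so retries re-attempt
	}
}

// Demo: two concurrent calls for the same key share one producer invocation.
async function demo(): Promise<number> {
	const cache = new Map<string, string>();
	const inFlight = new Map<string, Promise<string>>();
	let producerCalls = 0;
	const produce = async () => {
		producerCalls += 1;
		return 'tools';
	};
	await Promise.all([
		loadOnce(cache, inFlight, 'k', produce),
		loadOnce(cache, inFlight, 'k', produce),
	]);
	return producerCalls; // 1: the second call joined the first's promise
}
```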


@ -75,6 +75,7 @@ describe('streamAgentRun', () => {
expect(result.status).toBe('errored');
expect(result.mastraRunId).toBe('mastra-run-1');
expect(result.workSummary).toBe(emptyWorkSummary);
});
it('returns completed status for successful streams', async () => {
@ -106,6 +107,7 @@ describe('streamAgentRun', () => {
);
expect(result.status).toBe('completed');
expect(result.workSummary).toBe(emptyWorkSummary);
});
it('passes through the buffered manual confirmation event', async () => {
@ -158,6 +160,7 @@ describe('streamAgentRun', () => {
expect(result.status).toBe('suspended');
expect(result.mastraRunId).toBe('mastra-run-1');
expect(result.workSummary).toBe(emptyWorkSummary);
expect(result.suspension?.requestId).toBe('request-1');
expect(result.confirmationEvent?.type).toBe('confirmation-request');
expect(result.confirmationEvent?.payload.requestId).toBe('request-1');


@ -0,0 +1,211 @@
import type { InstanceAiEvent } from '@n8n/api-types';
import { InstanceAiTerminalResponseGuard } from '../terminal-response-guard';
const runId = 'run-1';
const rootAgentId = 'agent-root';
const guard = () => new InstanceAiTerminalResponseGuard({ runId, rootAgentId });
function runStart(): InstanceAiEvent {
return {
type: 'run-start',
runId,
agentId: rootAgentId,
payload: { messageId: 'msg-1', messageGroupId: 'mg-1' },
};
}
function rootText(text = 'hello'): InstanceAiEvent {
return {
type: 'text-delta',
runId,
agentId: rootAgentId,
payload: { text },
};
}
function rootError(content = 'failed'): InstanceAiEvent {
return {
type: 'error',
runId,
agentId: rootAgentId,
payload: { content },
};
}
function childText(): InstanceAiEvent {
return {
type: 'text-delta',
runId,
agentId: 'child-agent',
payload: { text: 'visible child output' },
};
}
function confirmation(
overrides: Partial<Extract<InstanceAiEvent, { type: 'confirmation-request' }>['payload']> = {},
): Extract<InstanceAiEvent, { type: 'confirmation-request' }> {
return {
type: 'confirmation-request',
runId,
agentId: rootAgentId,
payload: {
requestId: 'req-1',
toolCallId: 'tc-1',
toolName: 'pause-for-user',
args: {},
severity: 'info',
message: 'Please confirm',
...overrides,
},
};
}
describe('InstanceAiTerminalResponseGuard', () => {
it('does not emit fallback when a completed run already has root text', () => {
const decision = guard().evaluateTerminal([runStart(), rootText()], 'completed');
expect(decision.action).toBe('none');
expect(decision.visibilitySource).toBe('root-text');
});
it('emits text fallback for silent completed runs with structured work counts only', () => {
const decision = guard().evaluateTerminal([runStart()], 'completed', {
workSummary: { totalToolCalls: 3, totalToolErrors: 1, toolCalls: [] },
});
expect(decision.action).toBe('emit');
expect(decision.event?.type).toBe('text-delta');
expect(decision.event?.payload).toEqual({
text: 'I finished the run, but I did not generate a final response. I ran 3 tools; 1 tool errored.',
});
});
it('emits sanitized error when partial root text is followed by failure', () => {
const decision = guard().evaluateTerminal([runStart(), rootText('partial')], 'errored', {
errorMessage: 'Safe error',
});
expect(decision.action).toBe('emit');
expect(decision.reason).toBe('errored-after-text');
expect(decision.event).toMatchObject({
type: 'error',
payload: { content: 'Safe error' },
});
});
it('does not emit cancellation fallback when partial root text exists', () => {
const decision = guard().evaluateTerminal([runStart(), rootText('partial')], 'cancelled');
expect(decision.action).toBe('none');
expect(decision.visibilitySource).toBe('root-text');
});
it('logs root error then completed as already visible', () => {
const decision = guard().evaluateTerminal([runStart(), rootError()], 'completed');
expect(decision.action).toBe('none');
expect(decision.reason).toBe('completed-after-error');
});
it('does not count sub-agent text as root visibility', () => {
const decision = guard().evaluateTerminal([runStart(), childText()], 'completed');
expect(decision.action).toBe('emit');
expect(decision.reason).toBe('completed-silent');
});
it('does not emit duplicate fallback for the same run', () => {
const first = guard().evaluateTerminal([runStart()], 'completed');
const second = guard().evaluateTerminal([runStart(), first.event!], 'completed');
expect(first.action).toBe('emit');
expect(second.action).toBe('none');
expect(second.reason).toBe('already-emitted');
});
it('does not let a prior retry fallback hide the current silent run', () => {
const decision = guard().evaluateTerminal(
[
runStart(),
{
type: 'error',
runId: 'run-previous',
agentId: 'agent-001',
responseId: 'terminal-fallback:run-previous:errored',
payload: { content: 'Previous attempt failed.' },
},
],
'errored',
);
expect(decision.action).toBe('emit');
expect(decision.reason).toBe('errored-silent');
});
it('does not let a prior retry fallback hide a malformed confirmation payload', () => {
const decision = guard().evaluateWaiting(
[
runStart(),
{
type: 'error',
runId: 'run-previous',
agentId: rootAgentId,
responseId: 'terminal-fallback:run-previous:errored',
payload: { content: 'Previous attempt failed.' },
},
],
confirmation({ inputType: 'plan-review', message: 'message-only plan' }),
);
expect(decision.action).toBe('emit');
expect(decision.reason).toBe('confirmation-invalid');
});
it('treats displayable confirmation UI as visible waiting output', () => {
const decision = guard().evaluateWaiting([runStart()], confirmation());
expect(decision.action).toBe('none');
expect(decision.visibilitySource).toBe('confirmation-ui');
});
it('emits deterministic error for malformed confirmation payloads', () => {
const decision = guard().evaluateWaiting(
[runStart()],
confirmation({ inputType: 'plan-review', message: 'message-only plan' }),
);
expect(decision.action).toBe('emit');
expect(decision.reason).toBe('confirmation-invalid');
expect(decision.event?.type).toBe('error');
});
it('does not let prior root text hide a malformed confirmation payload', () => {
const decision = guard().evaluateWaiting(
[runStart(), rootText()],
confirmation({ inputType: 'plan-review', message: 'message-only plan' }),
);
expect(decision.action).toBe('emit');
expect(decision.reason).toBe('confirmation-invalid');
expect(decision.event?.type).toBe('error');
});
it('does not let prior root errors hide a malformed confirmation payload', () => {
const decision = guard().evaluateWaiting(
[runStart(), rootError()],
confirmation({ inputType: 'plan-review', message: 'message-only plan' }),
);
expect(decision.action).toBe('emit');
expect(decision.reason).toBe('confirmation-invalid');
expect(decision.event?.type).toBe('error');
});
it('does not emit fallback when prior root text precedes a valid confirmation', () => {
const decision = guard().evaluateWaiting([runStart(), rootText()], confirmation());
expect(decision.action).toBe('none');
expect(decision.reason).toBe('already-visible');
});
});
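The visibility rule these tests pin down can be restated as a toy predicate (an illustration, not the shipped class): only a non-empty `text-delta` from the root agent of the current run counts as user-visible output.

```typescript
// Toy restatement of the root-visibility rule exercised above.
interface ToyEvent {
	type: string;
	runId: string;
	agentId: string;
	payload: { text?: string };
}

function hasVisibleRootText(events: ToyEvent[], runId: string, rootAgentId: string): boolean {
	return events.some(
		(e) =>
			e.runId === runId && // events from other runs (e.g. a prior retry) never count
			e.agentId === rootAgentId && // sub-agent output is not root visibility
			e.type === 'text-delta' &&
			(e.payload.text ?? '').trim().length > 0,
	);
}
```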


@ -9,6 +9,7 @@ import {
type ResumableStreamSource,
type TraceStatus,
} from './resumable-stream-executor';
import type { WorkSummary } from '../stream/work-summary-accumulator';
import { getTraceParentRun, withTraceParentContext } from '../tracing/langsmith-tracing';
import { asResumable } from '../utils/stream-helpers';
import type { SuspensionInfo } from '../utils/stream-helpers';
@ -30,6 +31,7 @@ export interface StreamRunResult {
status: TraceStatus;
mastraRunId: string;
text?: Promise<string>;
workSummary: WorkSummary;
suspension?: SuspensionInfo;
confirmationEvent?: Extract<InstanceAiEvent, { type: 'confirmation-request' }>;
}
@ -96,6 +98,7 @@ async function consumeStream(
status: 'suspended',
mastraRunId: result.mastraRunId,
text: result.text,
workSummary: result.workSummary,
suspension: result.suspension,
...(result.confirmationEvent ? { confirmationEvent: result.confirmationEvent } : {}),
};
@ -110,5 +113,6 @@ async function consumeStream(
: 'completed',
mastraRunId: result.mastraRunId,
text: result.text,
workSummary: result.workSummary,
};
}


@ -0,0 +1,245 @@
import {
isDisplayableConfirmationRequest,
type InstanceAiConfirmationRequestEvent,
type InstanceAiEvent,
} from '@n8n/api-types';
import type { WorkSummary } from '../stream/work-summary-accumulator';
export type TerminalResponseStatus = 'completed' | 'cancelled' | 'errored' | 'waiting';
export type TerminalVisibilitySource =
| 'root-text'
| 'root-error'
| 'confirmation-ui'
| 'fallback'
| 'none';
export interface TerminalResponseGuardOptions {
runId: string;
rootAgentId: string;
messageGroupId?: string;
correlationId?: string;
}
export interface TerminalResponseDecision {
status: TerminalResponseStatus;
visibilitySource: TerminalVisibilitySource;
action: 'none' | 'emit';
reason:
| 'already-visible'
| 'already-emitted'
| 'completed-silent'
| 'cancelled-silent'
| 'errored-silent'
| 'errored-after-text'
| 'completed-after-error'
| 'confirmation-visible'
| 'confirmation-invalid';
event?: InstanceAiEvent;
}
const FALLBACK_RESPONSE_PREFIX = 'terminal-fallback';
function pluralize(count: number, singular: string, plural = `${singular}s`): string {
return count === 1 ? singular : plural;
}
function formatWorkSummaryCounts(workSummary?: WorkSummary): string {
if (!workSummary || workSummary.totalToolCalls === 0) return '';
const toolText = `${workSummary.totalToolCalls} ${pluralize(workSummary.totalToolCalls, 'tool')}`;
if (workSummary.totalToolErrors === 0) return ` I ran ${toolText}.`;
return ` I ran ${toolText}; ${workSummary.totalToolErrors} ${pluralize(
workSummary.totalToolErrors,
'tool',
)} errored.`;
}
function hasText(event: InstanceAiEvent): boolean {
return event.type === 'text-delta' && event.payload.text.trim().length > 0;
}
export class InstanceAiTerminalResponseGuard {
constructor(private readonly options: TerminalResponseGuardOptions) {}
evaluateTerminal(
events: InstanceAiEvent[],
status: Exclude<TerminalResponseStatus, 'waiting'>,
options: { workSummary?: WorkSummary; errorMessage?: string } = {},
): TerminalResponseDecision {
const visibility = this.getVisibility(events);
if (visibility.hasCurrentRunFallback) {
return {
status,
visibilitySource: 'fallback',
action: 'none',
reason: 'already-emitted',
};
}
if (status === 'completed') {
if (visibility.hasRootError) {
return {
status,
visibilitySource: 'root-error',
action: 'none',
reason: 'completed-after-error',
};
}
if (visibility.hasRootText) {
return {
status,
visibilitySource: 'root-text',
action: 'none',
reason: 'already-visible',
};
}
return this.emitText(
status,
'completed-silent',
`I finished the run, but I did not generate a final response.${formatWorkSummaryCounts(
options.workSummary,
)}`,
);
}
if (status === 'cancelled') {
if (visibility.hasRootText || visibility.hasRootError) {
return {
status,
visibilitySource: visibility.hasRootError ? 'root-error' : 'root-text',
action: 'none',
reason: 'already-visible',
};
}
return this.emitText(
status,
'cancelled-silent',
'The run was cancelled before I could send a response.',
);
}
if (visibility.hasRootError) {
return {
status,
visibilitySource: 'root-error',
action: 'none',
reason: 'already-visible',
};
}
return this.emitError(
status,
visibility.hasRootText ? 'errored-after-text' : 'errored-silent',
options.errorMessage ??
'I hit an error before I could finish that response. Please try again.',
);
}
evaluateWaiting(
events: InstanceAiEvent[],
confirmationEvent: InstanceAiConfirmationRequestEvent | undefined,
): TerminalResponseDecision {
const visibility = this.getVisibility(events);
if (visibility.hasCurrentRunFallback) {
return {
status: 'waiting',
visibilitySource: 'fallback',
action: 'none',
reason: 'already-emitted',
};
}
const hasDisplayableConfirmation =
confirmationEvent !== undefined &&
isDisplayableConfirmationRequest(confirmationEvent.payload);
if (!hasDisplayableConfirmation) {
return this.emitError(
'waiting',
'confirmation-invalid',
'I need your input to continue, but I could not display the prompt. Please try again.',
);
}
if (visibility.hasRootText || visibility.hasRootError) {
return {
status: 'waiting',
visibilitySource: visibility.hasRootError ? 'root-error' : 'root-text',
action: 'none',
reason: 'already-visible',
};
}
return {
status: 'waiting',
visibilitySource: 'confirmation-ui',
action: 'none',
reason: 'confirmation-visible',
};
}
private getVisibility(events: InstanceAiEvent[]): {
hasRootText: boolean;
hasRootError: boolean;
hasCurrentRunFallback: boolean;
} {
const currentRunEvents = events.filter((event) => event.runId === this.options.runId);
return {
hasRootText: currentRunEvents.some(
(event) => event.agentId === this.options.rootAgentId && hasText(event),
),
hasRootError: currentRunEvents.some(
(event) => event.agentId === this.options.rootAgentId && event.type === 'error',
),
hasCurrentRunFallback: currentRunEvents.some((event) =>
event.responseId?.startsWith(`${FALLBACK_RESPONSE_PREFIX}:${this.options.runId}:`),
),
};
}
private emitText(
status: TerminalResponseStatus,
reason: TerminalResponseDecision['reason'],
text: string,
): TerminalResponseDecision {
return {
status,
visibilitySource: 'none',
action: 'emit',
reason,
event: {
type: 'text-delta',
runId: this.options.runId,
agentId: this.options.rootAgentId,
responseId: this.fallbackResponseId(status),
payload: { text },
},
};
}
private emitError(
status: TerminalResponseStatus,
reason: TerminalResponseDecision['reason'],
content: string,
): TerminalResponseDecision {
return {
status,
visibilitySource: 'none',
action: 'emit',
reason,
event: {
type: 'error',
runId: this.options.runId,
agentId: this.options.rootAgentId,
responseId: this.fallbackResponseId(status),
payload: { content },
},
};
}
private fallbackResponseId(status: TerminalResponseStatus): string {
return `${FALLBACK_RESPONSE_PREFIX}:${this.options.runId}:${status}`;
}
}
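The guard's terminal branching condenses to a small decision table over (status, visibility). A flags-only sketch (no event plumbing; `'errored'` here is an illustrative stand-in for the non-waiting, non-completed, non-cancelled status):

```typescript
type Status = 'completed' | 'cancelled' | 'errored';

interface Visibility {
  hasRootText: boolean;
  hasRootError: boolean;
  hasFallback: boolean;
}

// Returns the decision reason and whether a fallback message must be emitted,
// mirroring the branch order of evaluateTerminal above.
function decide(status: Status, v: Visibility): { reason: string; emit: boolean } {
  if (v.hasFallback) return { reason: 'already-emitted', emit: false };
  if (status === 'completed') {
    if (v.hasRootError) return { reason: 'completed-after-error', emit: false };
    if (v.hasRootText) return { reason: 'already-visible', emit: false };
    return { reason: 'completed-silent', emit: true };
  }
  if (status === 'cancelled') {
    if (v.hasRootText || v.hasRootError) return { reason: 'already-visible', emit: false };
    return { reason: 'cancelled-silent', emit: true };
  }
  if (v.hasRootError) return { reason: 'already-visible', emit: false };
  return { reason: v.hasRootText ? 'errored-after-text' : 'errored-silent', emit: true };
}
```

The asymmetry is deliberate: an error status always emits unless a root error is already visible, even when root text was shown, because text alone does not explain the failure.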

View File

@ -0,0 +1,116 @@
import type { Memory } from '@mastra/memory';
jest.mock('../thread-patch', () => ({
patchThread: jest.fn(),
}));
import { TerminalOutcomeStorage, type TerminalOutcome } from '../terminal-outcome-storage';
import { patchThread } from '../thread-patch';
const mockedPatchThread = jest.mocked(patchThread);
function makeMemory(): Memory {
return {
getThreadById: jest.fn(),
} as unknown as Memory;
}
function makeOutcome(overrides: Partial<TerminalOutcome> = {}): TerminalOutcome {
return {
id: 'outcome-1',
threadId: 'thread-1',
runId: 'run-1',
taskId: 'task-1',
agentId: 'agent-1',
status: 'completed',
userFacingMessage: 'done',
createdAt: '2026-05-02T00:00:00.000Z',
...overrides,
};
}
describe('TerminalOutcomeStorage', () => {
let memory: Memory;
let storage: TerminalOutcomeStorage;
beforeEach(() => {
jest.clearAllMocks();
memory = makeMemory();
storage = new TerminalOutcomeStorage(memory);
});
describe('getUndelivered()', () => {
it('returns valid undelivered outcomes when one stored entry is malformed', async () => {
const valid = makeOutcome({ id: 'outcome-valid' });
(memory.getThreadById as jest.Mock).mockResolvedValue({
metadata: {
instanceAiTerminalOutcomes: {
'outcome-valid': valid,
'outcome-broken': { id: 'outcome-broken' },
},
},
});
const result = await storage.getUndelivered('thread-1');
expect(result).toEqual([valid]);
});
it('returns empty list when metadata is missing', async () => {
(memory.getThreadById as jest.Mock).mockResolvedValue({ metadata: {} });
const result = await storage.getUndelivered('thread-1');
expect(result).toEqual([]);
});
it('skips delivered outcomes', async () => {
const undelivered = makeOutcome({ id: 'undelivered' });
const delivered = makeOutcome({
id: 'delivered',
deliveredAt: '2026-05-02T00:00:01.000Z',
});
(memory.getThreadById as jest.Mock).mockResolvedValue({
metadata: {
instanceAiTerminalOutcomes: {
undelivered,
delivered,
},
},
});
const result = await storage.getUndelivered('thread-1');
expect(result).toEqual([undelivered]);
});
});
describe('upsert()', () => {
it('preserves valid outcomes when an existing entry is malformed', async () => {
const valid = makeOutcome({ id: 'outcome-valid' });
const next = makeOutcome({ id: 'outcome-new' });
let captured: Record<string, unknown> | undefined;
mockedPatchThread.mockImplementation(async (_memory, args) => {
await Promise.resolve();
const patch = args.update({
metadata: {
instanceAiTerminalOutcomes: {
'outcome-valid': valid,
'outcome-broken': { not: 'an outcome' },
},
},
} as unknown as Parameters<typeof args.update>[0]);
captured = patch?.metadata?.instanceAiTerminalOutcomes as Record<string, unknown>;
return null;
});
await storage.upsert('thread-1', next);
expect(captured).toEqual({
'outcome-valid': valid,
'outcome-new': next,
});
});
});
});

View File

@ -7,4 +7,6 @@ export interface AgentTreeSnapshot {
runIds?: string[];
langsmithRunId?: string;
langsmithTraceId?: string;
createdAt?: Date;
updatedAt?: Date;
}

View File

@ -4,6 +4,8 @@ export type { IterationEntry, IterationLog } from './iteration-log';
export { MastraIterationLogStorage } from './mastra-iteration-log-storage';
export { MastraTaskStorage } from './mastra-task-storage';
export { PlannedTaskStorage } from './planned-task-storage';
export { TerminalOutcomeStorage } from './terminal-outcome-storage';
export type { TerminalOutcome } from './terminal-outcome-storage';
export { patchThread } from './thread-patch';
export type { PatchableThreadMemory, ThreadPatch } from './thread-patch';
export { WorkflowLoopStorage } from './workflow-loop-storage';

View File

@ -0,0 +1,86 @@
import type { Memory } from '@mastra/memory';
import { z } from 'zod';
import { patchThread } from './thread-patch';
const METADATA_KEY = 'instanceAiTerminalOutcomes';
const terminalOutcomeStatusSchema = z.enum(['completed', 'failed', 'cancelled']);
const terminalOutcomeSchema = z.object({
id: z.string(),
threadId: z.string(),
runId: z.string(),
messageGroupId: z.string().optional(),
correlationId: z.string().optional(),
taskId: z.string(),
agentId: z.string(),
status: terminalOutcomeStatusSchema,
userFacingMessage: z.string(),
createdAt: z.string(),
deliveredAt: z.string().optional(),
});
export type TerminalOutcome = z.infer<typeof terminalOutcomeSchema>;
export class TerminalOutcomeStorage {
constructor(private readonly memory: Memory) {}
async upsert(threadId: string, outcome: TerminalOutcome): Promise<void> {
await patchThread(this.memory, {
threadId,
update: ({ metadata = {} }) => {
const current = parseOutcomes(metadata[METADATA_KEY]);
return {
metadata: {
...metadata,
[METADATA_KEY]: {
...current,
[outcome.id]: outcome,
},
},
};
},
});
}
async markDelivered(threadId: string, outcomeId: string, deliveredAt: string): Promise<void> {
await patchThread(this.memory, {
threadId,
update: ({ metadata = {} }) => {
const current = parseOutcomes(metadata[METADATA_KEY]);
const outcome = current[outcomeId];
if (!outcome) return null;
return {
metadata: {
...metadata,
[METADATA_KEY]: {
...current,
[outcomeId]: {
...outcome,
deliveredAt,
},
},
},
};
},
});
}
async getUndelivered(threadId: string): Promise<TerminalOutcome[]> {
const thread = await this.memory.getThreadById({ threadId });
const outcomes = parseOutcomes(thread?.metadata?.[METADATA_KEY]);
return Object.values(outcomes).filter((outcome) => !outcome.deliveredAt);
}
}
function parseOutcomes(raw: unknown): Record<string, TerminalOutcome> {
if (!raw || typeof raw !== 'object') return {};
const outcomes: Record<string, TerminalOutcome> = {};
for (const [key, value] of Object.entries(raw as Record<string, unknown>)) {
const parsed = terminalOutcomeSchema.safeParse(value);
if (parsed.success) outcomes[key] = parsed.data;
}
return outcomes;
}
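The key property of `parseOutcomes` is that one malformed metadata entry cannot poison the rest: `safeParse` drops it and the valid siblings survive (this is what the "preserves valid outcomes" tests above exercise). A dependency-free sketch of the same tolerance pattern, with a hand-rolled guard standing in for the zod schema and a trimmed-down outcome shape:

```typescript
// Minimal stand-in for TerminalOutcome — only the fields the guard checks.
interface Outcome {
  id: string;
  status: 'completed' | 'failed' | 'cancelled';
  deliveredAt?: string;
}

function isOutcome(value: unknown): value is Outcome {
  if (!value || typeof value !== 'object') return false;
  const v = value as Record<string, unknown>;
  return (
    typeof v.id === 'string' &&
    (v.status === 'completed' || v.status === 'failed' || v.status === 'cancelled')
  );
}

// Malformed entries are skipped instead of failing the whole read.
function parseOutcomes(raw: unknown): Record<string, Outcome> {
  if (!raw || typeof raw !== 'object') return {};
  const outcomes: Record<string, Outcome> = {};
  for (const [key, value] of Object.entries(raw as Record<string, unknown>)) {
    if (isOutcome(value)) outcomes[key] = value;
  }
  return outcomes;
}

const stored = {
  good: { id: 'a', status: 'completed' },
  broken: { id: 'b' }, // missing status — silently dropped
};
console.log(Object.keys(parseOutcomes(stored))); // only "good" survives
```

The real implementation gets the same behavior for free from `terminalOutcomeSchema.safeParse`, plus full field validation.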

View File

@ -36,7 +36,7 @@ function createMockContext(
createFromWorkflowJSON: jest.fn(),
updateFromWorkflowJSON: jest.fn(),
archive: jest.fn(),
delete: jest.fn(),
unarchive: jest.fn(),
publish: jest.fn().mockResolvedValue({ activeVersionId: 'v1' }),
unpublish: jest.fn(),
},
@ -148,6 +148,7 @@ describe('workflows tool', () => {
name: 'Test Workflow',
versionId: 'v1',
activeVersionId: null,
isArchived: false,
createdAt: '2024-01-01',
updatedAt: '2024-01-01',
},
@ -161,6 +162,26 @@ describe('workflows tool', () => {
expect(context.workflowService.list).toHaveBeenCalledWith({ limit: 10, query: 'test' });
expect(result).toEqual({ workflows });
});
it('should pass archived status when listing archived workflows', async () => {
const context = createMockContext();
(context.workflowService.list as jest.Mock).mockResolvedValue([]);
const tool = createWorkflowsTool(context, 'full');
await tool.execute!({ action: 'list', status: 'archived' }, {} as never);
expect(context.workflowService.list).toHaveBeenCalledWith({ status: 'archived' });
});
it('should pass all status when listing all workflows', async () => {
const context = createMockContext();
(context.workflowService.list as jest.Mock).mockResolvedValue([]);
const tool = createWorkflowsTool(context, 'full');
await tool.execute!({ action: 'list', status: 'all' }, {} as never);
expect(context.workflowService.list).toHaveBeenCalledWith({ status: 'all' });
});
});
describe('get action', () => {
@ -172,6 +193,7 @@ describe('workflows tool', () => {
connections: {},
versionId: 'v1',
activeVersionId: null,
isArchived: false,
createdAt: '2024-01-01',
updatedAt: '2024-01-01',
};
@ -211,7 +233,7 @@ describe('workflows tool', () => {
const suspend = jest.fn();
const tool = createWorkflowsTool(context, 'full');
await tool.execute!({ action: 'delete', workflowId: 'wf1' }, {
const result = await tool.execute!({ action: 'delete', workflowId: 'wf1' }, {
agent: { suspend, resumeData: undefined },
} as never);
@ -221,6 +243,11 @@ describe('workflows tool', () => {
message: expect.stringContaining('My WF'),
severity: 'warning',
});
expect(result).toEqual({
success: false,
denied: true,
reason: 'Awaiting confirmation',
});
});
it('should fall back to workflowId in message when lookup fails', async () => {
@ -267,6 +294,97 @@ describe('workflows tool', () => {
});
});
describe('unarchive action', () => {
it('should return denied when permission is blocked', async () => {
const context = createMockContext({
permissions: { deleteWorkflow: 'blocked' },
});
const tool = createWorkflowsTool(context, 'full');
const result = await tool.execute!({ action: 'unarchive', workflowId: 'wf1' }, {} as never);
expect(result).toEqual({
success: false,
denied: true,
reason: 'Action blocked by admin',
});
expect(context.workflowService.unarchive).not.toHaveBeenCalled();
});
it('should suspend for confirmation using the looked-up workflow name', async () => {
const context = createMockContext();
(context.workflowService.get as jest.Mock).mockResolvedValue({
id: 'wf1',
name: 'Archived WF',
});
const suspend = jest.fn();
const tool = createWorkflowsTool(context, 'full');
const result = await tool.execute!({ action: 'unarchive', workflowId: 'wf1' }, {
agent: { suspend, resumeData: undefined },
} as never);
expect(context.workflowService.get).toHaveBeenCalledWith('wf1');
expect(suspend).toHaveBeenCalled();
expect(suspend.mock.calls[0][0]).toMatchObject({
message: expect.stringContaining('Archived WF'),
severity: 'warning',
});
expect(suspend.mock.calls[0][0].message).toContain('will not publish it');
expect(result).toEqual({
success: false,
denied: true,
reason: 'Awaiting confirmation',
});
});
it('should return the suspension result when approval is pending', async () => {
const context = createMockContext();
(context.workflowService.get as jest.Mock).mockResolvedValue({
id: 'wf1',
name: 'Archived WF',
});
const suspension = { suspended: true };
const suspend = jest.fn().mockResolvedValue(suspension);
const tool = createWorkflowsTool(context, 'full');
const result = await tool.execute!({ action: 'unarchive', workflowId: 'wf1' }, {
agent: { suspend, resumeData: undefined },
} as never);
expect(result).toBe(suspension);
expect(context.workflowService.unarchive).not.toHaveBeenCalled();
});
it('should unarchive when approved via resume', async () => {
const context = createMockContext();
const tool = createWorkflowsTool(context, 'full');
const result = await tool.execute!({ action: 'unarchive', workflowId: 'wf1' }, {
agent: { resumeData: { approved: true } },
} as never);
expect(context.workflowService.unarchive).toHaveBeenCalledWith('wf1');
expect(result).toEqual({ success: true });
});
it('should return denied when user rejects', async () => {
const context = createMockContext();
const tool = createWorkflowsTool(context, 'full');
const result = await tool.execute!({ action: 'unarchive', workflowId: 'wf1' }, {
agent: { resumeData: { approved: false } },
} as never);
expect(result).toEqual({
success: false,
denied: true,
reason: 'User denied the action',
});
expect(context.workflowService.unarchive).not.toHaveBeenCalled();
});
});
describe('publish action', () => {
it('should return denied when permission is blocked', async () => {
const context = createMockContext({

View File

@ -19,7 +19,7 @@ function createMockContext(overrides?: Partial<InstanceAiContext>): InstanceAiCo
createFromWorkflowJSON: jest.fn(),
updateFromWorkflowJSON: jest.fn(),
archive: jest.fn(),
delete: jest.fn(),
unarchive: jest.fn(),
publish: jest.fn(),
unpublish: jest.fn(),
clearAiTemporary: jest.fn(),

View File

@ -0,0 +1,34 @@
import { buildNudgeStreamInput, NUDGE_PROMPT } from '../browser-credential-setup.nudge';
describe('buildNudgeStreamInput', () => {
it('returns the bare nudge string when there are no prior messages', () => {
const result = buildNudgeStreamInput([]);
expect(result).toBe(NUDGE_PROMPT);
});
it('appends a nudge user message after the prior conversation', () => {
const prior = [
{ role: 'user' as const, content: 'briefing' },
{ role: 'assistant' as const, content: 'I clicked some buttons' },
];
const result = buildNudgeStreamInput(prior);
expect(Array.isArray(result)).toBe(true);
const messages = result as Array<{ role: string; content: string }>;
expect(messages).toHaveLength(3);
expect(messages[0]).toBe(prior[0]);
expect(messages[1]).toBe(prior[1]);
expect(messages[2]).toEqual({ role: 'user', content: NUDGE_PROMPT });
});
it('does not mutate the input array', () => {
const prior = [{ role: 'user' as const, content: 'briefing' }];
const snapshot = [...prior];
buildNudgeStreamInput(prior);
expect(prior).toEqual(snapshot);
});
});

View File

@ -0,0 +1,25 @@
/**
* Sent to the sub-agent when it stops without calling `pause-for-user`. The
* agent has already done the browser work; it just forgot the final
* confirmation step that hands the user back the credential locations.
*/
export const NUDGE_PROMPT =
'You stopped without confirming with the user. Call pause-for-user NOW to tell the user where the credential values live and to enter them privately in the n8n credential form.';
/**
* Build the input for a nudge `subAgent.stream()` call. Each Mastra
* `agent.stream()` invocation starts a fresh conversation passing only the
* nudge string would land in an empty context and the sub-agent would
* apologetically give up. We replay the prior conversation as input so the
* sub-agent has full context (briefing, browser actions, tool results) when
* composing its `pause-for-user` message.
*
* Returns the bare nudge string when there's nothing to replay (defensive
* fallback; this should not happen in practice once the first stream has run).
*/
export function buildNudgeStreamInput<M>(
priorMessages: readonly M[],
): Array<M | { role: 'user'; content: string }> | string {
if (priorMessages.length === 0) return NUDGE_PROMPT;
return [...priorMessages, { role: 'user', content: NUDGE_PROMPT }];
}
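Because `buildNudgeStreamInput` is pure, both branches are easy to demonstrate standalone. A sketch using a shortened stand-in for the real prompt text and a simplified message shape:

```typescript
// Shortened stand-in for the real NUDGE_PROMPT string.
const NUDGE_PROMPT = 'Call pause-for-user NOW.';

function buildNudgeStreamInput<M>(
  priorMessages: readonly M[],
): Array<M | { role: 'user'; content: string }> | string {
  if (priorMessages.length === 0) return NUDGE_PROMPT;
  return [...priorMessages, { role: 'user', content: NUDGE_PROMPT }];
}

// Empty history → bare string; otherwise the replayed history plus one
// appended user turn, leaving the input array untouched.
const empty = buildNudgeStreamInput([]);
const replayed = buildNudgeStreamInput([{ role: 'user' as const, content: 'briefing' }]);
console.log(empty === NUDGE_PROMPT); // true
console.log(Array.isArray(replayed) && replayed.length); // 2
```

Spreading into a new array (rather than `push`) is what makes the "does not mutate the input array" test above hold.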

View File

@ -5,6 +5,7 @@ import { instanceAiConfirmationSeveritySchema } from '@n8n/api-types';
import { nanoid } from 'nanoid';
import { z } from 'zod';
import { buildNudgeStreamInput } from './browser-credential-setup.nudge';
import { buildBrowserAgentPrompt, type BrowserToolSource } from './browser-credential-setup.prompt';
import {
failTraceRun,
@ -319,19 +320,20 @@ export function createBrowserCredentialSetupTool(context: OrchestrationContext)
if (lastSuspendedToolName !== 'pause-for-user' && nudgeCount < MAX_NUDGES) {
// Agent ended without a final pause-for-user confirmation.
// Re-invoke with a nudge to call pause-for-user.
// Replay the prior conversation + a nudge so the sub-agent
// has full context to finish — Mastra `stream()` is otherwise
// stateless across calls.
nudgeCount++;
const nudge = await subAgent.stream(
'You stopped without confirming with the user. Call pause-for-user NOW to tell the user where the credential values live and to enter them privately in the n8n credential form.',
{
maxSteps: MAX_STEPS.BROWSER,
abortSignal: context.abortSignal,
providerOptions: {
anthropic: { cacheControl: { type: 'ephemeral' } },
},
...(llmStepTraceHooks?.executionOptions ?? {}),
const priorMessages = activeStream.messageList.get.all.aiV5.model();
const nudgeInput = buildNudgeStreamInput(priorMessages);
const nudge = await subAgent.stream(nudgeInput, {
maxSteps: MAX_STEPS.BROWSER,
abortSignal: context.abortSignal,
providerOptions: {
anthropic: { cacheControl: { type: 'ephemeral' } },
},
);
...(llmStepTraceHooks?.executionOptions ?? {}),
});
activeStream = nudge;
activeMastraRunId =
(typeof nudge.runId === 'string' && nudge.runId) ||

View File

@ -1,6 +1,7 @@
/**
* Consolidated workflows tool: list, get, get-as-code, delete, setup,
* publish, unpublish, list-versions, get-version, restore-version, update-version.
* Consolidated workflows tool: list, get, get-as-code, delete/archive,
* unarchive, setup, publish, unpublish, list-versions, get-version,
* restore-version, update-version.
*/
import { createTool } from '@mastra/core/tools';
import type { WorkflowJSON } from '@n8n/workflow-sdk';
@ -23,6 +24,12 @@ const listAction = z.object({
action: z.literal('list').describe('List workflows accessible to the current user'),
query: z.string().optional().describe('Filter workflows by name'),
limit: z.number().int().positive().max(100).optional().describe('Max results to return'),
status: z
.enum(['active', 'archived', 'all'])
.optional()
.describe(
'Which workflows to list. Defaults to active; use archived to find workflows that can be restored.',
),
});
const getAction = z.object({
@ -38,7 +45,14 @@ const getAsCodeAction = z.object({
const deleteAction = z.object({
action: z
.literal('delete')
.describe('Archive a workflow by ID (soft delete — recoverable by the user)'),
.describe('Archive a workflow by ID. This is reversible with the unarchive action.'),
workflowId: z.string().describe('ID of the workflow'),
});
const unarchiveAction = z.object({
action: z
.literal('unarchive')
.describe('Restore an archived workflow by ID without publishing it'),
workflowId: z.string().describe('ID of the workflow'),
});
@ -99,10 +113,13 @@ const updateVersionAction = z.object({
// ── Suspend / resume schemas ────────────────────────────────────────────────
// Setup suspend is a superset of the standard confirmation suspend (has
// requestId, message, severity plus extra fields), so we use it as the base.
// Add optional fields so the union covers both standard and setup payloads.
const suspendSchema = setupSuspendSchema;
const confirmationSuspendSchema = setupSuspendSchema.pick({
requestId: true,
message: true,
severity: true,
});
const suspendSchema = z.union([setupSuspendSchema, confirmationSuspendSchema]);
// Resume: union of standard confirmation (approved) and setup-specific fields.
const resumeSchema = setupResumeSchema;
@ -116,6 +133,7 @@ type Input =
| z.infer<typeof getAction>
| z.infer<typeof getAsCodeAction>
| z.infer<typeof deleteAction>
| z.infer<typeof unarchiveAction>
| z.infer<typeof setupAction>
| z.infer<typeof publishExtendedAction>
| z.infer<typeof unpublishAction>
@ -134,6 +152,7 @@ function buildInputSchema(context: InstanceAiContext, surface: 'full' | 'orchest
listAction,
getAction,
deleteAction,
unarchiveAction,
setupAction,
hasNamedVersions ? publishExtendedAction : publishBaseAction,
unpublishAction,
@ -174,6 +193,7 @@ async function handleList(context: InstanceAiContext, input: Extract<Input, { ac
const workflows = await context.workflowService.list({
limit: input.limit,
query: input.query,
...(input.status ? { status: input.status } : {}),
});
return { workflows };
}
@ -207,7 +227,7 @@ async function handleDelete(
ctx: { agent?: { resumeData?: unknown; suspend?: unknown } },
) {
const resumeData = ctx?.agent?.resumeData as z.infer<typeof resumeSchema> | undefined;
const suspend = ctx?.agent?.suspend as ((payload: unknown) => Promise<void>) | undefined;
const suspend = ctx?.agent?.suspend as ((payload: unknown) => Promise<unknown>) | undefined;
if (context.permissions?.deleteWorkflow === 'blocked') {
return { success: false, denied: true, reason: 'Action blocked by admin' };
@ -218,13 +238,12 @@ async function handleDelete(
// First call — suspend for confirmation (unless always_allow)
if (needsApproval && (resumeData === undefined || resumeData === null)) {
const workflowName = await resolveWorkflowName(context, input.workflowId);
await suspend?.({
const suspension = await suspend?.({
requestId: nanoid(),
message: `Archive workflow "${workflowName}" (ID: ${input.workflowId})? This will deactivate it if needed and can be undone later.`,
severity: 'warning' as const,
});
// suspend() never resolves — this line is unreachable but satisfies the type checker
return { success: false };
return suspension ?? { success: false, denied: true, reason: 'Awaiting confirmation' };
}
// Denied
@ -236,6 +255,38 @@ async function handleDelete(
return { success: true };
}
async function handleUnarchive(
context: InstanceAiContext,
input: Extract<Input, { action: 'unarchive' }>,
ctx: { agent?: { resumeData?: unknown; suspend?: unknown } },
) {
const resumeData = ctx?.agent?.resumeData as z.infer<typeof resumeSchema> | undefined;
const suspend = ctx?.agent?.suspend as ((payload: unknown) => Promise<unknown>) | undefined;
if (context.permissions?.deleteWorkflow === 'blocked') {
return { success: false, denied: true, reason: 'Action blocked by admin' };
}
const needsApproval = context.permissions?.deleteWorkflow !== 'always_allow';
if (needsApproval && (resumeData === undefined || resumeData === null)) {
const workflowName = await resolveWorkflowName(context, input.workflowId);
const suspension = await suspend?.({
requestId: nanoid(),
message: `Restore archived workflow "${workflowName}" (ID: ${input.workflowId})? This will make it visible again but will not publish it.`,
severity: 'warning' as const,
});
return suspension ?? { success: false, denied: true, reason: 'Awaiting confirmation' };
}
if (resumeData !== undefined && resumeData !== null && !resumeData.approved) {
return { success: false, denied: true, reason: 'User denied the action' };
}
await context.workflowService.unarchive(input.workflowId);
return { success: true };
}
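Each confirmation-gated handler above follows the same three-path state machine: first call suspends for approval, a rejected resume denies, an approved resume performs the action. A synchronous sketch of that flow (the `Deps` shape and function names are illustrative; the real handlers are async and read `resumeData`/`suspend` off the Mastra agent context):

```typescript
type Resume = { approved: boolean } | undefined;

interface Deps {
  permission: 'blocked' | 'always_allow' | 'ask';
  // Returns a suspension object when the runtime pauses the tool, else undefined.
  suspend: (message: string) => Record<string, unknown> | undefined;
  unarchive: (id: string) => void;
}

function unarchiveFlow(deps: Deps, workflowId: string, resumeData: Resume): Record<string, unknown> {
  if (deps.permission === 'blocked') {
    return { success: false, denied: true, reason: 'Action blocked by admin' };
  }
  const needsApproval = deps.permission !== 'always_allow';
  // Path 1: first call with no resume data — ask the user before acting.
  if (needsApproval && resumeData == null) {
    const suspension = deps.suspend(`Restore archived workflow ${workflowId}?`);
    return suspension ?? { success: false, denied: true, reason: 'Awaiting confirmation' };
  }
  // Path 2: user rejected the confirmation.
  if (resumeData != null && !resumeData.approved) {
    return { success: false, denied: true, reason: 'User denied the action' };
  }
  // Path 3: approved (or always_allow) — perform the side effect.
  deps.unarchive(workflowId);
  return { success: true };
}

const restored: string[] = [];
const deps: Deps = {
  permission: 'ask',
  suspend: () => undefined,
  unarchive: (id) => restored.push(id),
};
console.log(unarchiveFlow(deps, 'wf1', undefined)); // awaiting confirmation; nothing restored yet
console.log(unarchiveFlow(deps, 'wf1', { approved: true })); // success; restored now holds "wf1"
```

Returning `suspension ?? {...}` instead of assuming `suspend()` never resolves is exactly the change this diff makes across `handleDelete`, `handlePublish`, `handleUnpublish`, and `handleRestoreVersion`.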
async function handleSetup(
context: InstanceAiContext,
input: Extract<Input, { action: 'setup' }>,
@ -243,7 +294,7 @@ async function handleSetup(
state: { currentRequestId: string | null; preTestSnapshot: WorkflowJSON | null },
) {
const resumeData = ctx?.agent?.resumeData as z.infer<typeof setupResumeSchema> | undefined;
const suspend = ctx?.agent?.suspend as ((payload: unknown) => Promise<void>) | undefined;
const suspend = ctx?.agent?.suspend as ((payload: unknown) => Promise<unknown>) | undefined;
// State 1: Analyze workflow and suspend for user setup
if (resumeData === undefined || resumeData === null) {
@ -439,7 +490,7 @@ async function handlePublish(
ctx: { agent?: { resumeData?: unknown; suspend?: unknown } },
) {
const resumeData = ctx?.agent?.resumeData as { approved: boolean } | undefined;
const suspend = ctx?.agent?.suspend as ((payload: unknown) => Promise<void>) | undefined;
const suspend = ctx?.agent?.suspend as ((payload: unknown) => Promise<unknown>) | undefined;
const hasNamedVersions = !!context.workflowService.updateVersion;
if (context.permissions?.publishWorkflow === 'blocked') {
@ -451,14 +502,14 @@ async function handlePublish(
if (needsApproval && (resumeData === undefined || resumeData === null)) {
const workflowName = await resolveWorkflowName(context, input.workflowId);
await suspend?.({
const suspension = await suspend?.({
requestId: nanoid(),
message: input.versionId
? `Publish version "${input.versionId}" of workflow "${workflowName}" (ID: ${input.workflowId})?`
: `Publish workflow "${workflowName}" (ID: ${input.workflowId})?`,
severity: 'warning' as const,
});
return { success: false };
return suspension ?? { success: false, denied: true, reason: 'Awaiting confirmation' };
}
if (resumeData !== undefined && resumeData !== null && !resumeData.approved) {
@ -490,7 +541,7 @@ async function handleUnpublish(
ctx: { agent?: { resumeData?: unknown; suspend?: unknown } },
) {
const resumeData = ctx?.agent?.resumeData as { approved: boolean } | undefined;
const suspend = ctx?.agent?.suspend as ((payload: unknown) => Promise<void>) | undefined;
const suspend = ctx?.agent?.suspend as ((payload: unknown) => Promise<unknown>) | undefined;
if (context.permissions?.publishWorkflow === 'blocked') {
return { success: false, denied: true, reason: 'Action blocked by admin' };
@ -500,12 +551,12 @@ async function handleUnpublish(
if (needsApproval && (resumeData === undefined || resumeData === null)) {
const workflowName = await resolveWorkflowName(context, input.workflowId);
await suspend?.({
const suspension = await suspend?.({
requestId: nanoid(),
message: `Unpublish workflow "${workflowName}" (ID: ${input.workflowId})?`,
severity: 'warning' as const,
});
return { success: false };
return suspension ?? { success: false, denied: true, reason: 'Awaiting confirmation' };
}
if (resumeData !== undefined && resumeData !== null && !resumeData.approved) {
@ -547,7 +598,7 @@ async function handleRestoreVersion(
ctx: { agent?: { resumeData?: unknown; suspend?: unknown } },
) {
const resumeData = ctx?.agent?.resumeData as { approved: boolean } | undefined;
const suspend = ctx?.agent?.suspend as ((payload: unknown) => Promise<void>) | undefined;
const suspend = ctx?.agent?.suspend as ((payload: unknown) => Promise<unknown>) | undefined;
if (context.permissions?.restoreWorkflowVersion === 'blocked') {
return { success: false, denied: true, reason: 'Action blocked by admin' };
@ -565,12 +616,12 @@ async function handleRestoreVersion(
? `"${version.name}" (${timestamp})`
: `"${input.versionId}" (${timestamp ?? 'unknown date'})`;
await suspend?.({
const suspension = await suspend?.({
requestId: nanoid(),
message: `Restore workflow to version ${versionLabel}? This will overwrite the current draft.`,
severity: 'warning' as const,
});
return { success: false };
return suspension ?? { success: false, denied: true, reason: 'Awaiting confirmation' };
}
if (resumeData !== undefined && resumeData !== null && !resumeData.approved) {
@ -616,7 +667,7 @@ export function createWorkflowsTool(
return createTool({
id: 'workflows',
description:
'Manage workflows — list, inspect, delete, set up, publish, unpublish, and manage versions.',
'Manage workflows — list, inspect, archive, restore, set up, publish, unpublish, and manage versions. Workflow results use activeVersionId: null for unpublished workflows.',
inputSchema,
suspendSchema,
resumeSchema,
@ -630,6 +681,8 @@ export function createWorkflowsTool(
return await handleGetAsCode(context, input);
case 'delete':
return await handleDelete(context, input, ctx);
case 'unarchive':
return await handleUnarchive(context, input, ctx);
case 'setup':
return await handleSetup(context, input, ctx, setupState);
case 'publish':

View File

@ -24,7 +24,7 @@ function createMockContext(overrides?: Partial<InstanceAiContext>): InstanceAiCo
createFromWorkflowJSON: jest.fn(),
updateFromWorkflowJSON: jest.fn(),
archive: jest.fn(),
delete: jest.fn(),
unarchive: jest.fn(),
publish: jest.fn(),
unpublish: jest.fn(),
clearAiTemporary: jest.fn(),

View File

@ -16,7 +16,6 @@ jest.mock('@mastra/core/tools', () => ({
jest.mock('@n8n/workflow-sdk', () => ({
validateWorkflow: jest.fn(() => ({ errors: [], warnings: [] })),
layoutWorkflowJSON: jest.fn((wf: unknown) => wf),
}));
// `require` (rather than `import`) is needed because `submit-workflow.tool`

View File

@ -1,5 +1,5 @@
import { createTool } from '@mastra/core/tools';
import { generateWorkflowCode, layoutWorkflowJSON } from '@n8n/workflow-sdk';
import { generateWorkflowCode } from '@n8n/workflow-sdk';
import { z } from 'zod';
import { buildCredentialMap, resolveCredentials } from './resolve-credentials';
@ -149,9 +149,7 @@ export function createBuildWorkflowTool(context: InstanceAiContext) {
};
}
// Apply Dagre layout to produce positions matching the FE's tidy-up.
// Temporary: remove once the SDK is published with toJSON({ tidyUp: true }).
const json = layoutWorkflowJSON(result.workflow);
const json = result.workflow;
if (name) {
json.name = name;
} else if (!json.name && !workflowId) {

View File

@ -10,7 +10,7 @@ import { createTool } from '@mastra/core/tools';
import type { Workspace } from '@mastra/core/workspace';
import { hasPlaceholderDeep } from '@n8n/utils';
import type { WorkflowJSON } from '@n8n/workflow-sdk';
import { validateWorkflow, layoutWorkflowJSON } from '@n8n/workflow-sdk';
import { validateWorkflow } from '@n8n/workflow-sdk';
import { createHash, randomUUID } from 'node:crypto';
import { z } from 'zod';
@ -398,10 +398,7 @@ export function createSubmitWorkflowTool(
};
}
// Apply Dagre layout to produce positions matching the FE's tidy-up.
// Temporary: until the SDK is published with toJSON({ tidyUp: true }) support,
// the sandbox's SDK doesn't have Dagre layout, so we apply it server-side.
const json = layoutWorkflowJSON(buildOutput.workflow);
const json = buildOutput.workflow;
if (name) {
json.name = name;
} else if (!json.name && !workflowId) {

View File

@ -20,6 +20,7 @@ import type { GenericValue, INodeTypes } from 'n8n-workflow';
import type { DomainAccessTracker } from './domain-access/domain-access-tracker';
import type { InstanceAiEventBus } from './event-bus/event-bus.interface';
import type { Logger } from './logger';
import type { McpClientManager } from './mcp/mcp-client-manager';
import type { BuilderSandboxSessionRegistry } from './runtime/builder-sandbox-session-registry';
import type { IterationLog } from './storage/iteration-log';
import type { IdRemapper, TraceIndex, TraceWriter } from './tracing/trace-replay';
@ -38,6 +39,7 @@ export interface WorkflowSummary {
name: string;
versionId: string;
activeVersionId: string | null;
isArchived: boolean;
createdAt: string;
updatedAt: string;
tags?: string[];
@ -148,8 +150,14 @@ export interface WorkflowVersionDetail extends WorkflowVersionSummary {
connections: Record<string, unknown>;
}
export type WorkflowListStatus = 'active' | 'archived' | 'all';
export interface InstanceAiWorkflowService {
list(options?: { query?: string; limit?: number }): Promise<WorkflowSummary[]>;
list(options?: {
query?: string;
limit?: number;
status?: WorkflowListStatus;
}): Promise<WorkflowSummary[]>;
get(workflowId: string): Promise<WorkflowDetail>;
/** Get the workflow as the SDK's WorkflowJSON (full node data for generateWorkflowCode). */
getAsWorkflowJSON(workflowId: string): Promise<WorkflowJSON>;
@@ -165,7 +173,7 @@ export interface InstanceAiWorkflowService {
options?: { projectId?: string },
): Promise<WorkflowDetail>;
archive(workflowId: string): Promise<void>;
delete(workflowId: string): Promise<void>;
unarchive(workflowId: string): Promise<void>;
/**
* Clear the AI-builder temporary marker on a workflow used to promote the
* main deliverable so the run-finish reap leaves it alone.
@@ -545,9 +553,13 @@ export interface InstanceAiWorkspaceService {
// ── Local gateway status ─────────────────────────────────────────────────────
export type LocalGatewayStatus =
| { status: 'connected' }
| { status: 'disconnected'; capabilities: string[] }
| { status: 'disabled' };
| {
status: 'connected';
capabilities: string[];
}
| {
status: 'disabledGlobally' | 'disconnected' | 'disabled';
};
// ── Context bundle ───────────────────────────────────────────────────────────
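The reshaped `LocalGatewayStatus` union above can be consumed by narrowing on `status`; a minimal sketch (the type is copied from the hunk, `describeGateway` is an illustrative helper, not part of the interface):

```typescript
// Only the 'connected' variant carries capabilities, so narrow on `status` first.
type LocalGatewayStatus =
	| { status: 'connected'; capabilities: string[] }
	| { status: 'disabledGlobally' | 'disconnected' | 'disabled' };

function describeGateway(s: LocalGatewayStatus): string {
	if (s.status === 'connected') {
		// TypeScript has narrowed `s` to the variant with `capabilities` here
		return `connected with ${s.capabilities.length} capabilities`;
	}
	return s.status; // narrowed to the capability-less variant
}
```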
@@ -981,6 +993,8 @@ export interface OrchestrationContext {
oauth2CallbackUrl?: string;
/** Webhook base URL for the n8n instance (e.g. http://localhost:5678/webhook) — used to construct webhook URLs for created workflows */
webhookBaseUrl?: string;
/** Form base URL for the n8n instance (e.g. http://localhost:5678/form) — distinct from webhookBaseUrl since Form Triggers serve at /form/, not /webhook/ */
formBaseUrl?: string;
/** Spawn a detached background task that outlives the current orchestrator run */
spawnBackgroundTask?: (opts: SpawnBackgroundTaskOptions) => SpawnBackgroundTaskResult;
/** Cancel a running background task by its ID */
@@ -1042,6 +1056,8 @@ export interface CreateInstanceAgentOptions {
context: InstanceAiContext;
orchestrationContext?: OrchestrationContext;
mcpServers?: McpServerConfig[];
/** Owns MCP client connections + tool listing caches; the service passes its singleton in. */
mcpManager: McpClientManager;
memoryConfig: InstanceAiMemoryConfig;
/** Pre-built Memory instance. When provided, `memoryConfig` is ignored for memory creation. */
memory?: Memory;

View File

@@ -78,7 +78,7 @@ export function parseAndValidate(
collectValidationIssues(graphValidation.errors, allWarnings);
collectValidationIssues(graphValidation.warnings, allWarnings);
const json = builder.toJSON();
const json = builder.toJSON({ tidyUp: true });
// Stage 2: Schema validation via Zod schemas from schemaBaseDirs.
// strictMode is hardcoded on at AI-builder call sites — we want every

View File

@@ -169,7 +169,7 @@ try {
process.exit(1);
}
const validation = wf.validate();
const json = wf.toJSON();
const json = wf.toJSON({ tidyUp: true });
const warnings = [...(validation.errors || []), ...(validation.warnings || [])];
// Use a replacer to preserve undefined values as null — newCredential() produces
// NewCredentialImpl which serializes to undefined in toJSON(). Without this,

View File

@@ -822,11 +822,11 @@ describe('OutputParserStructured', () => {
.calledWith('prompt', 0, NAIVE_FIX_PROMPT)
.mockReturnValueOnce('Invalid prompt without error placeholder');
await expect(outputParser.supplyData.call(thisArg, 0)).rejects.toThrow(
new NodeOperationError(
thisArg.getNode(),
'Auto-fixing parser prompt has to contain {error} placeholder',
),
const execution = outputParser.supplyData.call(thisArg, 0);
await expect(execution).rejects.toThrow(NodeOperationError);
await expect(execution).rejects.toThrow(
'Auto-fixing parser prompt has to contain {error} placeholder',
);
});
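The test hunks above replace a deep-equality match on a constructed error with two looser checks on one captured rejection. A Jest-free, synchronous sketch of the same idea (`NodeOperationError` is a stand-in for n8n-workflow's class of the same name; `captureThrow` is an illustrative helper):

```typescript
class NodeOperationError extends Error {}

function supplyData(): never {
	throw new NodeOperationError('Auto-fixing parser prompt has to contain {error} placeholder');
}

// Run the failing call once, then assert on class and message separately.
function captureThrow(fn: () => unknown): Error {
	try {
		fn();
	} catch (e) {
		return e as Error;
	}
	throw new Error('expected fn to throw');
}

const err = captureThrow(supplyData);
// err instanceof NodeOperationError  → class check
// err.message.includes('{error}')    → message check, no full Error equality
```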

View File

@@ -505,12 +505,11 @@ describe('VectorStoreRedis.node', () => {
} as any;
const node = new RedisNode.VectorStoreRedis();
await expect((node as any).populateVectorStore(context, {}, [], 0)).rejects.toEqual(
new NodeOperationError(context.getNode(), 'Error: fail', {
itemIndex: 0,
description: 'Please check your index/schema and parameters',
}),
);
const execution = (node as any).populateVectorStore(context, {}, [], 0);
await expect(execution).rejects.toThrow(NodeOperationError);
await expect(execution).rejects.toThrow('Error: fail');
expect(loadOptionsFunctions.logger.info).toHaveBeenCalledWith(
'Error while populating the store: fail',

View File

@@ -200,13 +200,17 @@ export const getConnectedTools = async (
0,
)) as SupplyDataToolResponse[];
// Get parent nodes to map toolkits to their source nodes
// Get parent nodes to map toolkits to their source nodes.
// getInputConnectionData filters out disabled nodes, so parents must be filtered
// the same way to keep the index alignment between toolkitConnections and parentNodes.
const parentNodes =
'getParentNodes' in ctx
? ctx.getParentNodes(ctx.getNode().name, {
connectionType: NodeConnectionTypes.AiTool,
depth: 1,
})
? ctx
.getParentNodes(ctx.getNode().name, {
connectionType: NodeConnectionTypes.AiTool,
depth: 1,
})
.filter((node) => !node.disabled)
: [];
const connectedTools = (toolkitConnections ?? [])
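The index-alignment fix above can be distilled to a small sketch: `getInputConnectionData` drops disabled nodes, so the parent list must be filtered the same way before zipping tools with their source nodes (all names here are illustrative):

```typescript
interface ParentNode {
	name: string;
	disabled?: boolean;
}

function attributeTools<T>(
	tools: T[],
	parents: ParentNode[],
): Array<{ tool: T; sourceNodeName: string }> {
	// Filter disabled parents so indices line up with the (already filtered) tools.
	const enabledParents = parents.filter((p) => !p.disabled);
	return tools.map((tool, i) => ({
		tool,
		sourceNodeName: enabledParents[i]?.name ?? 'unknown',
	}));
}

const attributed = attributeTools(
	['alpha', 'charlie'],
	[
		{ name: 'Tool Alpha' },
		{ name: 'Tool Bravo', disabled: true }, // filtered out, keeping indices aligned
		{ name: 'Tool Charlie' },
	],
);
```

Without the filter, `'charlie'` would be attributed to `'Tool Bravo'`, which is exactly the drift the new test pins down.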

View File

@@ -341,6 +341,38 @@ describe('getConnectedTools', () => {
sourceNodeName: 'MCP Client Tool',
});
});
it('should map source node names correctly when a disabled tool node is still connected', async () => {
// getParentNodes returns ALL parents including disabled ones,
// while getInputConnectionData filters disabled nodes out.
// getConnectedTools must skip disabled parents to keep the index in sync.
const mockParentNodes = [
{ name: 'Tool Alpha', disabled: false },
{ name: 'Tool Bravo', disabled: true },
{ name: 'Tool Charlie', disabled: false },
];
const mockTools = [
{ name: 'alpha', description: 'desc-alpha' },
{ name: 'charlie', description: 'desc-charlie' },
];
mockExecuteFunctions.getInputConnectionData = jest.fn().mockResolvedValue(mockTools);
mockExecuteFunctions.getParentNodes = jest.fn().mockReturnValue(mockParentNodes);
const tools = await getConnectedTools(mockExecuteFunctions, false);
expect(tools).toHaveLength(2);
expect(tools[0].name).toBe('alpha');
expect(tools[0].metadata).toEqual({
isFromToolkit: false,
sourceNodeName: 'Tool Alpha',
});
expect(tools[1].name).toBe('charlie');
expect(tools[1].metadata).toEqual({
isFromToolkit: false,
sourceNodeName: 'Tool Charlie',
});
});
});
describe('unwrapNestedOutput', () => {

View File

@@ -148,9 +148,6 @@ export { runOnceForAllItems, runOnceForEachItem } from './utils/code-helpers';
// Utility functions
export { isPlainObject, getProperty, hasProperty } from './utils/safe-access';
// Layout
export { layoutWorkflowJSON } from './workflow-builder/layout-utils';
// Validation
export {
validateWorkflow,

View File

@@ -383,5 +383,41 @@ describe('calculateNodePositionsDagre', () => {
const positions2 = calculateNodePositionsDagre(nodes2);
expect(positions2.has('remote-note')).toBe(false);
});
it('preserves explicit positions and anchors new nodes around them', () => {
const nodes = new Map<string, GraphNode>();
const triggerConns = makeMainConns([[0, [makeTarget('set')]]]);
nodes.set(
'trigger',
createGraphNode('trigger', 'n8n-nodes-base.manualTrigger', triggerConns, [500, 600]),
);
nodes.set('set', createGraphNode('set', 'n8n-nodes-base.set'));
const positions = calculateNodePositionsDagre(nodes);
// Explicit position is not overwritten (function only returns positions for unpositioned nodes)
expect(positions.has('trigger')).toBe(false);
// New node gets a position from dagre
expect(positions.has('set')).toBe(true);
});
it('reanchors sticky notes using explicit positions of covered nodes', () => {
const nodes = new Map<string, GraphNode>();
const triggerConns = makeMainConns([[0, [makeTarget('set')]]]);
nodes.set(
'trigger',
createGraphNode('trigger', 'n8n-nodes-base.manualTrigger', triggerConns, [500, 600]),
);
nodes.set('set', createGraphNode('set', 'n8n-nodes-base.set'));
// Sticky overlapping the explicitly positioned trigger
nodes.set('note', createGraphNode('note', STICKY_NODE_TYPE, undefined, [500, 600]));
const positions = calculateNodePositionsDagre(nodes);
// Sticky is reanchored relative to trigger's explicit position, not dagre's guess
expect(positions.get('note')).toEqual([496, 672]);
});
});
});
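The first new test above pins down a contract worth stating plainly: the layout pass returns positions only for nodes without an explicit one. A sketch of that contract, with a placeholder left-to-right walk standing in for the real Dagre layout:

```typescript
function layoutMissingPositions(
	explicit: Map<string, [number, number] | undefined>,
): Map<string, [number, number]> {
	const computed = new Map<string, [number, number]>();
	let x = 0;
	for (const [name, pos] of explicit) {
		if (pos) continue; // explicit position wins and is not re-laid-out
		computed.set(name, [x, 0]); // fixed spacing instead of Dagre's rank spacing
		x += 200;
	}
	return computed;
}

const positions = layoutMissingPositions(
	new Map<string, [number, number] | undefined>([
		['trigger', [500, 600]], // explicitly positioned → absent from the result
		['set', undefined], // unpositioned → gets a computed position
	]),
);
```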

View File

@@ -4,7 +4,7 @@
* Two layout strategies:
* 1. BFS layout (calculateNodePositions): simple left-to-right BFS, used by default toJSON()
* 2. Dagre layout (calculateNodePositionsDagre): mirrors the FE's useCanvasLayout algorithm,
* used by toJSON({ tidyUp: true }) and layoutWorkflowJSON()
* used by toJSON({ tidyUp: true })
*/
import dagre from '@dagrejs/dagre';
@@ -27,7 +27,7 @@ import {
DEFAULT_Y,
START_X,
} from './constants';
import type { GraphNode, WorkflowJSON, ConnectionTarget } from '../types/base';
import type { GraphNode } from '../types/base';
// ===========================================================================
// BFS Layout (default)
@@ -465,7 +465,12 @@ export function calculateNodePositionsDagre(
for (const name of nonStickyNames) {
const { width, height } = getNodeDimensions(name, aiParentNames, aiConfigNames, nodes);
parentGraph.setNode(name, { width, height });
const explicitPosition = nodes.get(name)?.instance.config?.position;
parentGraph.setNode(name, {
width,
height,
...(explicitPosition ? { x: explicitPosition[0], y: explicitPosition[1] } : {}),
});
}
// Add edges from connections
@@ -637,8 +642,23 @@
}
const positionsAfter = new Map<string, BoundingBox>();
for (const [name, box] of Object.entries(boundingBoxByNodeId)) {
positionsAfter.set(name, box);
for (const [name, graphNode] of nodes) {
const explicitPosition = graphNode.instance.config?.position;
if (explicitPosition) {
const { width, height } = getNodeDimensions(name, aiParentNames, aiConfigNames, nodes);
positionsAfter.set(name, {
x: explicitPosition[0],
y: explicitPosition[1],
width,
height,
});
continue;
}
const box = boundingBoxByNodeId[name];
if (box) {
positionsAfter.set(name, box);
}
}
repositionStickyNotes(stickyNames, nonStickyNames, positionsBefore, positionsAfter, positions);
@@ -646,79 +666,3 @@
return positions;
}
// ===========================================================================
// WorkflowJSON layout (operates on serialized workflow, not builder graph)
// ===========================================================================
/**
* Return a new WorkflowJSON with Dagre-computed node positions.
* Builds a GraphNode map from the serialized JSON and delegates to calculateNodePositionsDagre.
*
* Pure function: does not mutate the input.
*
* This is the entry point for code paths that receive pre-built WorkflowJSON
* (e.g., sandbox-compiled workflows in instance-ai) and need proper layout
* before the SDK is published with tidyUp support.
*/
export function layoutWorkflowJSON(json: WorkflowJSON): WorkflowJSON {
const jsonNodes = json.nodes;
if (!jsonNodes || jsonNodes.length === 0) return json;
const connections = json.connections ?? {};
// Build a GraphNode map from WorkflowJSON
const graphNodes = new Map<string, GraphNode>();
for (const node of jsonNodes) {
if (!node.name) continue;
const connectionsMap = new Map<string, Map<number, ConnectionTarget[]>>();
connectionsMap.set('main', new Map());
graphNodes.set(node.name, {
instance: {
type: node.type,
name: node.name,
version: node.typeVersion,
config: {},
} as unknown as GraphNode['instance'],
connections: connectionsMap,
});
}
// Populate connections from WorkflowJSON connections structure
for (const [sourceName, nodeConns] of Object.entries(connections)) {
const graphNode = graphNodes.get(sourceName);
if (!graphNode) continue;
for (const [connType, outputs] of Object.entries(nodeConns)) {
if (!Array.isArray(outputs)) continue;
let outputMap = graphNode.connections.get(connType);
if (!outputMap) {
outputMap = new Map();
graphNode.connections.set(connType, outputMap);
}
for (let outputIdx = 0; outputIdx < outputs.length; outputIdx++) {
const slot = outputs[outputIdx];
if (!Array.isArray(slot)) continue;
const targets: ConnectionTarget[] = slot
.filter((t): t is { node: string; type: string; index: number } => !!t?.node)
.map((t) => ({ node: t.node, type: t.type, index: t.index }));
if (targets.length > 0) {
outputMap.set(outputIdx, targets);
}
}
}
}
// Calculate positions using the Dagre layout
const positions = calculateNodePositionsDagre(graphNodes);
// Return new WorkflowJSON with updated positions
return {
...json,
nodes: jsonNodes.map((node) => {
const pos = node.name ? positions.get(node.name) : undefined;
return pos ? { ...node, position: pos } : node;
}),
};
}

View File

@@ -0,0 +1,77 @@
import { mockInstance } from '@n8n/backend-test-utils';
import { GlobalConfig } from '@n8n/config';
import { Container } from '@n8n/di';
import '@/zod-alias-support';
import { ImportService } from '@/services/import.service';
import { ImportWorkflowsCommand } from '../workflow';
jest.mock('@/services/import.service');
describe('ImportWorkflowsCommand', () => {
mockInstance(ImportService);
const globalConfig = Container.get(GlobalConfig);
const originalMode = globalConfig.executions.mode;
afterEach(() => {
globalConfig.executions.mode = originalMode;
});
const buildCommand = () => {
const command = new ImportWorkflowsCommand();
// @ts-expect-error Protected property
command.logger = {
info: jest.fn(),
error: jest.fn(),
};
return command;
};
describe('--activeState flag', () => {
it('throws when n8n is not running in queue mode and activeState is set to "fromJson"', async () => {
globalConfig.executions.mode = 'regular';
const command = buildCommand();
// @ts-expect-error Protected property
command.flags = {
input: './workflows.json',
separate: false,
activeState: 'fromJson',
};
await expect(command.run()).rejects.toThrow(
'The "--activeState=fromJson" flag can only be used when n8n is running in queue or multi-main mode. In regular deployment mode, workflow activation is not supported.',
);
});
it('does not throw on the queue-mode guard when running in queue mode', async () => {
globalConfig.executions.mode = 'queue';
const command = buildCommand();
// @ts-expect-error Protected property
command.flags = {
// `input` intentionally missing so `run` returns early after the guard
// without us needing to mock filesystem/repositories.
separate: false,
activeState: 'fromJson',
};
await expect(command.run()).resolves.toBeUndefined();
});
it('does not throw when activeState is "false", regardless of mode', async () => {
globalConfig.executions.mode = 'regular';
const command = buildCommand();
// @ts-expect-error Protected property
command.flags = {
separate: false,
activeState: 'false',
};
await expect(command.run()).resolves.toBeUndefined();
});
});
});

View File

@@ -74,6 +74,16 @@ const flagsSchema = z.object({
.string()
.describe('The ID of the project to assign the imported workflows to')
.optional(),
activeState: z
.enum(['false', 'fromJson'], {
errorMap: () => ({
message: 'Valid values for flag "--activeState" are only "false" or "fromJson".',
}),
})
.describe(
'Whether to respect the JSON active field. "false" (default) deactivates all imported workflows. "fromJson" activates/deactivates each workflow based on its JSON active field.',
)
.default('false'),
});
@Command({
@@ -85,6 +95,7 @@ const flagsSchema = z.object({
'--input=file.json --userId=1d64c3d2-85fe-4a83-a649-e446b07b3aae',
'--input=file.json --projectId=Ox8O54VQrmBrb4qL',
'--separate --input=backups/latest/ --userId=1d64c3d2-85fe-4a83-a649-e446b07b3aae',
'--input=file.json --activeState=fromJson',
],
flagsSchema,
})
@@ -92,6 +103,12 @@ export class ImportWorkflowsCommand extends BaseCommand<z.infer<typeof flagsSche
async run(): Promise<void> {
const { flags } = this;
if (flags.activeState === 'fromJson' && this.globalConfig.executions.mode !== 'queue') {
throw new UserError(
'The "--activeState=fromJson" flag can only be used when n8n is running in queue or multi-main mode. In regular deployment mode, workflow activation is not supported.',
);
}
if (!flags.input) {
this.logger.info('An input file or directory with --input must be provided');
return;
@@ -124,7 +141,9 @@ export class ImportWorkflowsCommand extends BaseCommand<z.infer<typeof flagsSche
this.logger.info(`Importing ${workflows.length} workflows...`);
await Container.get(ImportService).importWorkflows(workflows, project.id);
await Container.get(ImportService).importWorkflows(workflows, project.id, {
activeState: flags.activeState,
});
this.reportSuccess(workflows.length);
}
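The guard added to `run()` above boils down to one mode check; a minimal sketch (the enum values and message mirror the hunk, the helper name is illustrative):

```typescript
type ActiveState = 'false' | 'fromJson';
type ExecutionsMode = 'regular' | 'queue';

// Returns an error message when the flag combination is invalid, else null.
function validateActiveStateFlag(activeState: ActiveState, mode: ExecutionsMode): string | null {
	if (activeState === 'fromJson' && mode !== 'queue') {
		// Mirrors the UserError thrown by ImportWorkflowsCommand.run()
		return 'The "--activeState=fromJson" flag can only be used when n8n is running in queue or multi-main mode.';
	}
	return null; // combination is allowed
}
```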

View File

@@ -19,8 +19,16 @@ import { License } from '@/license';
import { MfaService } from '@/mfa/mfa.service';
import type { MeRequest } from '@/requests';
import { UserService } from '@/services/user.service';
import { getCurrentAuthenticationMethod } from '@/sso.ee/sso-helpers';
import { badPasswords } from '@test/test-data';
jest.mock('@/sso.ee/sso-helpers', () => ({
...jest.requireActual('@/sso.ee/sso-helpers'),
getCurrentAuthenticationMethod: jest.fn(),
}));
const getCurrentAuthenticationMethodMock = getCurrentAuthenticationMethod as jest.Mock;
const browserId = 'test-browser-id';
describe('MeController', () => {
@@ -34,26 +42,26 @@ const controller = Container.get(MeController);
const controller = Container.get(MeController);
beforeEach(() => {
// Default: user has no SSO identities (email-based auth)
userService.findSsoIdentity.mockResolvedValue(undefined);
getCurrentAuthenticationMethodMock.mockReturnValue('email');
});
describe('updateCurrentUser', () => {
it('should update the user in the DB, and issue a new cookie', async () => {
const user = mock<User>({
const user = {
id: '123',
email: 'valid@email.com',
password: 'password',
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: false,
});
} as unknown as User;
const payload = new UserUpdateRequestDto({
email: 'valid@email.com',
firstName: 'John',
lastName: 'Potato',
});
const req = mock<AuthenticatedRequest>({ user, browserId });
const req = { user, browserId } as unknown as AuthenticatedRequest;
const res = mock<Response>();
userRepository.findOneByOrFail.mockResolvedValue(user);
userService.findUserWithAuthIdentities.mockResolvedValue(user);
@@ -91,15 +99,15 @@ });
});
it('should throw BadRequestError if beforeUpdate hook throws BadRequestError', async () => {
const user = mock<User>({
const user = {
id: '123',
password: 'password',
email: 'current@email.com',
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: false,
});
const req = mock<AuthenticatedRequest>({ user });
} as unknown as User;
const req = { user } as unknown as AuthenticatedRequest;
externalHooks.run.mockImplementationOnce(async (hookName) => {
if (hookName === 'user.profile.beforeUpdate') {
@@ -118,7 +126,7 @@ describe('when user is authenticated via LDAP or OIDC', () => {
describe('when user is authenticated via LDAP or OIDC', () => {
it('should throw BadRequestError when LDAP user tries to change their profile', async () => {
const user = mock<User>({
const user = {
id: '123',
email: 'ldap@email.com',
firstName: 'John',
@@ -127,10 +135,13 @@ authIdentities: [],
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: false,
});
const req = mock<AuthenticatedRequest>({ user, browserId });
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
userService.findSsoIdentity.mockResolvedValue(mock<AuthIdentity>({ providerType: 'ldap' }));
userService.findSsoIdentity.mockResolvedValue({
providerType: 'ldap',
} as unknown as AuthIdentity);
getCurrentAuthenticationMethodMock.mockReturnValue('ldap');
await expect(
controller.updateCurrentUser(
@@ -148,7 +159,7 @@ });
});
it('should throw BadRequestError when OIDC user tries to change their profile', async () => {
const user = mock<User>({
const user = {
id: '123',
email: 'oidc@email.com',
firstName: 'John',
@@ -157,10 +168,13 @@ authIdentities: [],
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: false,
});
const req = mock<AuthenticatedRequest>({ user, browserId });
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
userService.findSsoIdentity.mockResolvedValue(mock<AuthIdentity>({ providerType: 'oidc' }));
userService.findSsoIdentity.mockResolvedValue({
providerType: 'oidc',
} as unknown as AuthIdentity);
getCurrentAuthenticationMethodMock.mockReturnValue('oidc');
await expect(
controller.updateCurrentUser(
@@ -178,20 +192,20 @@ });
});
it('should allow non-LDAP/OIDC users to update their profile', async () => {
const user = mock<User>({
const user = {
id: '123',
email: 'valid@email.com',
password: 'password',
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: false,
});
} as unknown as User;
const payload = new UserUpdateRequestDto({
email: 'valid@email.com',
firstName: 'John',
lastName: 'Potato',
});
const req = mock<AuthenticatedRequest>({ user, browserId });
const req = { user, browserId } as unknown as AuthenticatedRequest;
const res = mock<Response>();
userRepository.findOneByOrFail.mockResolvedValue(user);
@@ -205,7 +219,7 @@ });
});
it('should block user with multiple identities if one is LDAP', async () => {
const user = mock<User>({
const user = {
id: '123',
email: 'multi@email.com',
firstName: 'John',
@@ -214,11 +228,14 @@ authIdentities: [],
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: false,
});
const req = mock<AuthenticatedRequest>({ user, browserId });
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
// User has multiple identities, one of which is LDAP - findSsoIdentity returns the SSO one
userService.findSsoIdentity.mockResolvedValue(mock<AuthIdentity>({ providerType: 'ldap' }));
userService.findSsoIdentity.mockResolvedValue({
providerType: 'ldap',
} as unknown as AuthIdentity);
getCurrentAuthenticationMethodMock.mockReturnValue('ldap');
await expect(
controller.updateCurrentUser(
@@ -236,17 +253,256 @@ });
});
});
describe('when an auth_identity exists but the SSO provider is no longer active', () => {
const passwordHash = '$2a$10$ffitcKrHT.Ls.m9FfWrMrOod76aaI0ogKbc3S96Q320impWpCbgj6'; // Hashed 'old_password'
const setUpdateMocks = (user: User) => {
userRepository.findOneByOrFail.mockResolvedValue(user);
userService.findUserWithAuthIdentities.mockResolvedValue(user);
jest.spyOn(jwt, 'sign').mockImplementation(() => 'signed-token');
userService.toPublic.mockResolvedValue({} as unknown as PublicUser);
};
it('should throw BadRequestError when SAML user tries to change their profile while SAML is enabled', async () => {
const user = {
id: '123',
email: 'saml@email.com',
firstName: 'John',
lastName: 'Doe',
password: 'password',
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: false,
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
userService.findSsoIdentity.mockResolvedValue({
providerType: 'saml',
} as unknown as AuthIdentity);
getCurrentAuthenticationMethodMock.mockReturnValue('saml');
await expect(
controller.updateCurrentUser(
req,
mock(),
new UserUpdateRequestDto({
email: 'saml@email.com',
firstName: 'Jane',
lastName: 'Doe',
}),
),
).rejects.toThrowError(
new BadRequestError('SAML user may not change their profile information'),
);
});
it('should allow profile update when SAML auth_identity exists but SAML is disabled', async () => {
const user = {
id: '123',
email: 'saml@email.com',
firstName: 'John',
lastName: 'Doe',
password: 'password',
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: false,
} as unknown as User;
const payload = new UserUpdateRequestDto({
email: 'saml@email.com',
firstName: 'NewFirst',
lastName: 'NewLast',
});
const req = { user, browserId } as unknown as AuthenticatedRequest;
const res = mock<Response>();
userService.findSsoIdentity.mockResolvedValue({
providerType: 'saml',
} as unknown as AuthIdentity);
setUpdateMocks(user);
await controller.updateCurrentUser(req, res, payload);
expect(userService.update).toHaveBeenCalledWith(user.id, {
email: 'saml@email.com',
firstName: 'NewFirst',
lastName: 'NewLast',
});
expect(eventService.emit).toHaveBeenCalledWith('user-updated', {
user,
fieldsChanged: ['firstName', 'lastName'],
});
});
it('should allow profile update when LDAP auth_identity exists but LDAP is disabled', async () => {
const user = {
id: '123',
email: 'ldap@email.com',
firstName: 'John',
lastName: 'Doe',
password: 'password',
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: false,
} as unknown as User;
const payload = new UserUpdateRequestDto({
email: 'ldap@email.com',
firstName: 'NewFirst',
lastName: 'NewLast',
});
const req = { user, browserId } as unknown as AuthenticatedRequest;
const res = mock<Response>();
userService.findSsoIdentity.mockResolvedValue({
providerType: 'ldap',
} as unknown as AuthIdentity);
setUpdateMocks(user);
await controller.updateCurrentUser(req, res, payload);
expect(userService.update).toHaveBeenCalled();
});
it('should allow profile update when OIDC auth_identity exists but OIDC is disabled', async () => {
const user = {
id: '123',
email: 'oidc@email.com',
firstName: 'John',
lastName: 'Doe',
password: 'password',
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: false,
} as unknown as User;
const payload = new UserUpdateRequestDto({
email: 'oidc@email.com',
firstName: 'NewFirst',
lastName: 'NewLast',
});
const req = { user, browserId } as unknown as AuthenticatedRequest;
const res = mock<Response>();
userService.findSsoIdentity.mockResolvedValue({
providerType: 'oidc',
} as unknown as AuthIdentity);
setUpdateMocks(user);
await controller.updateCurrentUser(req, res, payload);
expect(userService.update).toHaveBeenCalled();
});
it('should allow email change for previously-SAML user once SAML is disabled', async () => {
const user = {
id: '123',
email: 'saml-old@email.com',
firstName: 'John',
lastName: 'Doe',
password: passwordHash,
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: false,
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
const res = mock<Response>();
userService.findSsoIdentity.mockResolvedValue({
providerType: 'saml',
} as unknown as AuthIdentity);
setUpdateMocks(user);
await controller.updateCurrentUser(
req,
res,
new UserUpdateRequestDto({
email: 'saml-new@email.com',
firstName: 'John',
lastName: 'Doe',
currentPassword: 'old_password',
}),
);
expect(userService.update).toHaveBeenCalled();
});
it('should allow profile update when providerType is token-exchange', async () => {
const user = {
id: '123',
email: 'token@email.com',
firstName: 'John',
lastName: 'Doe',
password: 'password',
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: false,
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
const res = mock<Response>();
userService.findSsoIdentity.mockResolvedValue({
providerType: 'token-exchange',
} as unknown as AuthIdentity);
setUpdateMocks(user);
await controller.updateCurrentUser(
req,
res,
new UserUpdateRequestDto({
email: 'token@email.com',
firstName: 'NewFirst',
lastName: 'NewLast',
}),
);
expect(userService.update).toHaveBeenCalled();
});
it('should bypass the SSO guard when no profile fields are changing', async () => {
const user = {
id: '123',
email: 'unchanged@email.com',
firstName: 'Same',
lastName: 'Name',
password: 'password',
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: false,
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
const res = mock<Response>();
userService.findSsoIdentity.mockClear();
userService.findSsoIdentity.mockResolvedValue({
providerType: 'saml',
} as unknown as AuthIdentity);
getCurrentAuthenticationMethodMock.mockReturnValue('saml');
setUpdateMocks(user);
await controller.updateCurrentUser(
req,
res,
new UserUpdateRequestDto({
email: 'unchanged@email.com',
firstName: 'Same',
lastName: 'Name',
}),
);
expect(userService.findSsoIdentity).not.toHaveBeenCalled();
expect(userService.update).toHaveBeenCalled();
});
});
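The new describe block above pins down one behaviour: an SSO `auth_identity` only blocks profile edits while its provider is the active authentication method, so a stale identity from a now-disabled provider must not lock the user out. A sketch of that rule (all names are illustrative stand-ins, and the `token-exchange` case is folded into the "not the active method" branch):

```typescript
type AuthMethod = 'email' | 'ldap' | 'oidc' | 'saml';

function mayEditProfile(
	ssoProviderType: AuthMethod | undefined,
	currentMethod: AuthMethod,
): boolean {
	if (!ssoProviderType) return true; // plain email/password user
	// Block only while the identity's provider is the active auth method.
	return ssoProviderType !== currentMethod;
}
```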
describe('when mfa is enabled', () => {
it('should throw BadRequestError if mfa code is missing', async () => {
const user = mock<User>({
const user = {
id: '123',
email: 'valid@email.com',
password: 'password',
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: true,
});
const req = mock<AuthenticatedRequest>({ user, browserId });
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
await expect(
controller.updateCurrentUser(
@@ -262,15 +518,15 @@ });
});
it('should throw InvalidMfaCodeError if mfa code is invalid', async () => {
const user = mock<User>({
const user = {
id: '123',
email: 'valid@email.com',
password: 'password',
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: true,
});
const req = mock<AuthenticatedRequest>({ user, browserId });
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
mockMfaService.validateMfa.mockResolvedValue(false);
await expect(
@@ -288,7 +544,7 @@ });
});
it("should update the user's email if mfa code is valid", async () => {
const user = mock<User>({
const user = {
id: '123',
email: 'valid@email.com',
password: 'password',
@@ -296,8 +552,8 @@ role: GLOBAL_OWNER_ROLE,
role: GLOBAL_OWNER_ROLE,
mfaEnabled: true,
mfaSecret: 'secret',
});
const req = mock<AuthenticatedRequest>({ user, browserId });
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
const res = mock<Response>();
userRepository.findOneByOrFail.mockResolvedValue(user);
userService.findUserWithAuthIdentities.mockResolvedValue(user);
@@ -325,13 +581,13 @@ const passwordHash = '$2a$10$ffitcKrHT.Ls.m9FfWrMrOod76aaI0ogKbc3S96Q320impWpCbgj6'; // Hashed 'old_password'
const passwordHash = '$2a$10$ffitcKrHT.Ls.m9FfWrMrOod76aaI0ogKbc3S96Q320impWpCbgj6'; // Hashed 'old_password'
it('should throw BadRequestError if currentPassword is missing', async () => {
const user = mock<User>({
const user = {
id: '123',
email: 'michel-old@email.com',
password: passwordHash,
mfaEnabled: false,
});
const req = mock<AuthenticatedRequest>({ user, browserId });
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
await expect(
controller.updateCurrentUser(
@@ -347,13 +603,13 @@ });
});
it('should throw BadRequestError if currentPassword is not a string', async () => {
const user = mock<User>({
const user = {
id: '123',
email: 'michel-old@email.com',
password: passwordHash,
mfaEnabled: false,
});
const req = mock<AuthenticatedRequest>({ user, browserId });
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
await expect(
controller.updateCurrentUser(req, mock(), {
@@ -366,12 +622,12 @@ });
});
it('should throw BadRequestError if currentPassword is incorrect', async () => {
const user = mock<User>({
const user = {
email: 'michel-old@email.com',
password: passwordHash,
mfaEnabled: false,
});
const req = mock<AuthenticatedRequest>({ user, browserId });
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
await expect(
controller.updateCurrentUser(
@@ -392,12 +648,13 @@ });
});
it('should update the user email if currentPassword is correct', async () => {
const user = mock<User>({
const user = {
email: 'michel-old@email.com',
password: passwordHash,
mfaEnabled: false,
});
const req = mock<AuthenticatedRequest>({ user, browserId });
role: GLOBAL_OWNER_ROLE,
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
const res = mock<Response>();
userRepository.findOneByOrFail.mockResolvedValue(user);
userService.findUserWithAuthIdentities.mockResolvedValue(user);
@@ -420,12 +677,13 @@ });
});
it('should not require currentPassword when email is not being changed', async () => {
const user = mock<User>({
const user = {
email: 'michel@email.com',
password: passwordHash,
mfaEnabled: false,
});
const req = mock<AuthenticatedRequest>({ user, browserId });
role: GLOBAL_OWNER_ROLE,
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
const res = mock<Response>();
userRepository.findOneByOrFail.mockResolvedValue(user);
userService.findUserWithAuthIdentities.mockResolvedValue(user);
@@ -462,13 +720,13 @@ describe('MeController', () => {
});
it('should reject profile update for env-managed user', async () => {
const user = mock<User>({
const user = {
id: '123',
email: 'managed@example.com',
password: 'password',
role: GLOBAL_OWNER_ROLE,
});
const req = mock<AuthenticatedRequest>({ user, browserId });
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
await expect(
controller.updateCurrentUser(
@@ -484,12 +742,12 @@ describe('MeController', () => {
});
it('should reject password update for env-managed user', async () => {
const req = mock<AuthenticatedRequest>({
user: mock<User>({
const req = {
user: {
email: 'managed@example.com',
password: '$2a$10$ffitcKrHT.Ls.m9FfWrMrOod76aaI0ogKbc3S96Q320impWpCbgj6',
}),
});
} as unknown as User,
} as unknown as AuthenticatedRequest;
await expect(
controller.updatePassword(
@@ -505,15 +763,15 @@ describe('MeController', () => {
});
it('should allow profile update for non-env-managed owner', async () => {
const user = mock<User>({
const user = {
id: '456',
email: 'other-owner@example.com',
password: 'password',
authIdentities: [],
role: GLOBAL_OWNER_ROLE,
mfaEnabled: false,
});
const req = mock<AuthenticatedRequest>({ user, browserId });
} as unknown as User;
const req = { user, browserId } as unknown as AuthenticatedRequest;
const res = mock<Response>();
userRepository.findOneByOrFail.mockResolvedValue(user);
userService.findUserWithAuthIdentities.mockResolvedValue(user);
@@ -534,18 +792,18 @@ describe('MeController', () => {
const passwordHash = '$2a$10$ffitcKrHT.Ls.m9FfWrMrOod76aaI0ogKbc3S96Q320impWpCbgj6'; // Hashed 'old_password'
it('should throw if the user does not have a password set', async () => {
const req = mock<AuthenticatedRequest>({
const req = {
user: mock({ password: undefined }),
});
} as unknown as AuthenticatedRequest;
await expect(
controller.updatePassword(req, mock(), mock({ currentPassword: '', newPassword: '' })),
).rejects.toThrowError(new BadRequestError('Requesting user not set up.'));
});
it("should throw if currentPassword does not match the user's password", async () => {
const req = mock<AuthenticatedRequest>({
const req = {
user: mock({ password: passwordHash }),
});
} as unknown as AuthenticatedRequest;
await expect(
controller.updatePassword(
req,
@@ -558,10 +816,10 @@ describe('MeController', () => {
describe('should throw if newPassword is not valid', () => {
Object.entries(badPasswords).forEach(([newPassword, errorMessage]) => {
it(newPassword, async () => {
const req = mock<AuthenticatedRequest>({
const req = {
user: mock({ password: passwordHash }),
browserId,
});
} as unknown as AuthenticatedRequest;
await expect(
controller.updatePassword(
req,
@@ -574,10 +832,10 @@ describe('MeController', () => {
});
it('should update the password in the DB, and issue a new cookie', async () => {
const req = mock<AuthenticatedRequest>({
const req = {
user: mock({ password: passwordHash, mfaEnabled: false }),
browserId,
});
} as unknown as AuthenticatedRequest;
const res = mock<Response>();
userRepository.save.calledWith(req.user).mockResolvedValue(req.user);
jest.spyOn(jwt, 'sign').mockImplementation(() => 'new-signed-token');
@@ -614,9 +872,9 @@ describe('MeController', () => {
describe('mfa enabled', () => {
it('should throw BadRequestError if mfa code is missing', async () => {
const req = mock<AuthenticatedRequest>({
const req = {
user: mock({ password: passwordHash, mfaEnabled: true }),
});
} as unknown as AuthenticatedRequest;
await expect(
controller.updatePassword(
@@ -630,9 +888,9 @@ describe('MeController', () => {
});
it('should throw InvalidMfaCodeError if invalid mfa code is given', async () => {
const req = mock<AuthenticatedRequest>({
const req = {
user: mock({ password: passwordHash, mfaEnabled: true }),
});
} as unknown as AuthenticatedRequest;
mockMfaService.validateMfa.mockResolvedValue(false);
await expect(
@@ -649,10 +907,10 @@ describe('MeController', () => {
});
it('should succeed when mfa code is correct', async () => {
const req = mock<AuthenticatedRequest>({
const req = {
user: mock({ password: passwordHash, mfaEnabled: true, mfaSecret: 'secret' }),
browserId,
});
} as unknown as AuthenticatedRequest;
const res = mock<Response>();
userRepository.save.calledWith(req.user).mockResolvedValue(req.user);
jest.spyOn(jwt, 'sign').mockImplementation(() => 'new-signed-token');
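The hunks above replace jest-mock-extended's `mock<User>()` fixtures with plain object literals cast through `unknown`. A minimal standalone sketch of that cast pattern (the `User` and `AuthenticatedRequest` shapes here are simplified stand-ins, not the real `@n8n/db` types):

```typescript
// Simplified stand-ins for the real @n8n/db types (assumption, not the
// actual interfaces).
interface User {
  id: string;
  email: string;
  password: string;
  mfaEnabled: boolean;
}

interface AuthenticatedRequest {
  user: User;
  browserId: string;
}

// Instead of mock<User>({ ... }), build a partial literal and cast it.
// The double cast (as unknown as User) bypasses missing-property checks,
// so only the fields the test actually reads need to be supplied.
const user = {
  id: '123',
  email: 'michel-old@email.com',
  mfaEnabled: false,
} as unknown as User;

const req = { user, browserId: 'browser-1' } as unknown as AuthenticatedRequest;

console.log(req.user.email);
```

The trade-off: the cast sidesteps type safety, but unlike `mock<User>()` it avoids proxy-backed objects whose unread properties silently resolve to further mocks.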


@@ -6,7 +6,7 @@ import {
} from '@n8n/api-types';
import { Logger } from '@n8n/backend-common';
import { GlobalConfig } from '@n8n/config';
import type { User, PublicUser } from '@n8n/db';
import type { User, PublicUser, AuthIdentity } from '@n8n/db';
import { UserRepository, AuthenticatedRequest } from '@n8n/db';
import { Body, createUserKeyedRateLimiter, Patch, Post, RestController } from '@n8n/decorators';
import { plainToInstance } from 'class-transformer';
@@ -23,7 +23,7 @@ import { MfaService } from '@/mfa/mfa.service';
import { MeRequest } from '@/requests';
import { PasswordUtility } from '@/services/password.utility';
import { UserService } from '@/services/user.service';
import { isSamlLicensedAndEnabled } from '@/sso.ee/sso-helpers';
import { getCurrentAuthenticationMethod, isSamlLicensedAndEnabled } from '@/sso.ee/sso-helpers';
import { PersonalizationSurveyAnswersV4 } from './survey-answers.dto';
@@ -73,7 +73,7 @@ export class MeController {
if (isEmailBeingChanged || isFirstNameChanged || isLastNameChanged) {
const ssoIdentity = await this.userService.findSsoIdentity(userId);
if (ssoIdentity) {
if (ssoIdentity && this.isAuthIdentityActive(ssoIdentity)) {
this.logger.debug(
`Request to update user failed because ${ssoIdentity.providerType} user may not change their profile information`,
{
@@ -180,6 +180,10 @@ export class MeController {
);
}
private isAuthIdentityActive(authIdentity: AuthIdentity) {
return authIdentity.providerType === getCurrentAuthenticationMethod();
}
/**
* Update the logged-in user's password.
*/
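The `isAuthIdentityActive` guard added above only treats an SSO identity as blocking when its provider matches the instance's current authentication method. A rough standalone sketch of that check (the provider union and the `getCurrentAuthenticationMethod` stub are simplified assumptions):

```typescript
// Sketch of the guard: an SSO identity only blocks profile edits when its
// provider matches the instance's current authentication method. The
// provider union and the stub below are simplified assumptions.
type AuthProvider = 'email' | 'ldap' | 'saml' | 'oidc';

interface AuthIdentity {
  providerType: AuthProvider;
}

// Stand-in for getCurrentAuthenticationMethod() from sso-helpers.
function getCurrentAuthenticationMethod(): AuthProvider {
  return 'email';
}

function isAuthIdentityActive(authIdentity: AuthIdentity): boolean {
  return authIdentity.providerType === getCurrentAuthenticationMethod();
}

// A stale SAML identity no longer blocks the update once the instance has
// switched back to email auth.
console.log(isAuthIdentityActive({ providerType: 'saml' })); // false
```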


@@ -275,12 +275,14 @@ export class MessageEventBus extends EventEmitter {
await this.send(new EventMessageQueue(options));
}
// eslint-disable-next-line complexity
/**
* Does the following at startup:
* - checks for unsent messages in the log files and tries to resend them
* - cycles event logs and start the logging to a fresh file
* - checks for unfinished executions (executions for which we have events in the log files,
* but no final execution event) and tries to recover them if needed
*/
private async performStartupRecovery() {
// unsent event check:
// - find unsent messages in current event log(s)
// - cycle event logs and start the logging to a fresh file
// - retry sending events
this.logger.debug('Checking for unsent event messages');
const unsentAndUnfinished = await this.getUnsentAndUnfinishedExecutions();
this.logger.debug(
@@ -289,80 +291,102 @@ export class MessageEventBus extends EventEmitter {
this.logWriter?.startLogging();
await this.send(unsentAndUnfinished.unsentMessages);
let unfinishedExecutionIds = Object.keys(unsentAndUnfinished.unfinishedExecutions);
const unfinishedExecutionIds = await this.collectUnfinishedExecutionIds(
unsentAndUnfinished.unfinishedExecutions,
);
if (unfinishedExecutionIds.length === 0) {
return;
}
await this.logActiveWorkflows();
const recoveryAlreadyAttempted = this.logWriter?.isRecoveryProcessRunning();
if (recoveryAlreadyAttempted || this.globalConfig.eventBus.crashRecoveryMode === 'simple') {
await this.executionRepository.markAsCrashed(unfinishedExecutionIds);
// if we end up here, it means that the previous recovery process did not finish
// a possible reason would be that recreating the workflow data itself caused e.g. an OOM error
// in that case, we do not want to retry the recovery process, but rather mark the executions as crashed
if (recoveryAlreadyAttempted)
this.logger.warn('Skipped recovery process since it previously failed.');
} else {
// start actual recovery process and write recovery process flag file
this.logWriter?.startRecoveryProcess();
const recoveredIds: string[] = [];
const crashedWorkflowIds: Set<string> = new Set();
for (const executionId of unfinishedExecutionIds) {
const logMessages = unsentAndUnfinished.unfinishedExecutions[executionId];
const recoveredExecution = await this.recoveryService.recoverFromLogs(
executionId,
logMessages ?? [],
);
if (recoveredExecution) {
if (recoveredExecution.status === 'crashed') {
crashedWorkflowIds.add(recoveredExecution.workflowId);
}
recoveredIds.push(executionId);
}
}
if (recoveredIds.length > 0) {
this.logger.warn(`Found unfinished executions: ${recoveredIds.join(', ')}`);
this.logger.info('This could be due to a crash of an active workflow or a restart of n8n');
}
if (
this.globalConfig.executions.recovery.workflowDeactivationEnabled &&
crashedWorkflowIds.size > 0
) {
await this.recoveryService.autoDeactivateWorkflowsIfNeeded(crashedWorkflowIds);
}
}
// remove the recovery process flag file
this.logWriter?.endRecoveryProcess();
}
/**
* Logs the currently active workflows
*/
private async logActiveWorkflows() {
const activeWorkflows = await this.workflowRepository.find({
where: { activeVersionId: Not(IsNull()) },
select: ['id', 'name'],
});
if (activeWorkflows.length > 0) {
this.logger.info('Currently active workflows:');
for (const workflowData of activeWorkflows) {
this.logger.info(` - ${workflowData.name} (ID: ${workflowData.id})`);
}
}
}
/**
* Collects the execution ids of all unfinished executions. This includes all executions
* for which we have events in the log files, but no final execution event, as well as
* all executions in the database with status 'running' or 'unknown' (if we are not in queue mode).
*/
private async collectUnfinishedExecutionIds(
unfinishedExecutions: Record<string, EventMessageTypes[] | undefined>,
): Promise<string[]> {
const unfinishedExecutionIds = Object.keys(unfinishedExecutions);
// if we are in queue mode, running jobs may still be running on a worker despite the main process
// crashing, so we can't just mark them as crashed
if (this.globalConfig.executions.mode !== 'queue') {
const dbUnfinishedExecutionIds = (
await this.executionRepository.find({
where: {
status: In(['running', 'unknown']),
},
select: ['id'],
})
).map((e) => e.id);
unfinishedExecutionIds = Array.from(
new Set<string>([...unfinishedExecutionIds, ...dbUnfinishedExecutionIds]),
);
if (this.globalConfig.executions.mode === 'queue') {
return unfinishedExecutionIds;
}
if (unfinishedExecutionIds.length > 0) {
const activeWorkflows = await this.workflowRepository.find({
where: { activeVersionId: Not(IsNull()) },
select: ['id', 'name'],
});
if (activeWorkflows.length > 0) {
this.logger.info('Currently active workflows:');
for (const workflowData of activeWorkflows) {
this.logger.info(` - ${workflowData.name} (ID: ${workflowData.id})`);
}
}
const recoveryAlreadyAttempted = this.logWriter?.isRecoveryProcessRunning();
if (recoveryAlreadyAttempted || this.globalConfig.eventBus.crashRecoveryMode === 'simple') {
await this.executionRepository.markAsCrashed(unfinishedExecutionIds);
// if we end up here, it means that the previous recovery process did not finish
// a possible reason would be that recreating the workflow data itself caused e.g. an OOM error
// in that case, we do not want to retry the recovery process, but rather mark the executions as crashed
if (recoveryAlreadyAttempted)
this.logger.warn('Skipped recovery process since it previously failed.');
} else {
// start actual recovery process and write recovery process flag file
this.logWriter?.startRecoveryProcess();
const recoveredIds: string[] = [];
const crashedWorkflowIds: Set<string> = new Set();
const dbUnfinishedExecutions = await this.executionRepository.find({
where: {
status: In(['running', 'unknown']),
},
select: ['id'],
});
for (const executionId of unfinishedExecutionIds) {
const logMessages = unsentAndUnfinished.unfinishedExecutions[executionId];
const recoveredExecution = await this.recoveryService.recoverFromLogs(
executionId,
logMessages ?? [],
);
if (recoveredExecution) {
if (recoveredExecution.status === 'crashed') {
crashedWorkflowIds.add(recoveredExecution.workflowId);
}
recoveredIds.push(executionId);
}
}
if (recoveredIds.length > 0) {
this.logger.warn(`Found unfinished executions: ${recoveredIds.join(', ')}`);
this.logger.info(
'This could be due to a crash of an active workflow or a restart of n8n',
);
}
if (
this.globalConfig.executions.recovery.workflowDeactivationEnabled &&
crashedWorkflowIds.size > 0
) {
await this.recoveryService.autoDeactivateWorkflowsIfNeeded(crashedWorkflowIds);
}
}
// remove the recovery process flag file
this.logWriter?.endRecoveryProcess();
}
return Array.from(
new Set([...unfinishedExecutionIds, ...dbUnfinishedExecutions.map((e) => e.id)]),
);
}
}
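The extracted `collectUnfinishedExecutionIds` above merges execution ids found in the event logs with `running`/`unknown` executions from the database, except in queue mode where a worker may still own the job. A simplified sketch of that merge (the repository is replaced by a plain async callback for illustration):

```typescript
// Sketch of the merge logic: ids from the event log plus 'running'/'unknown'
// executions from the DB, de-duplicated, unless running in queue mode (a
// worker may still be processing those jobs).
type ExecutionsMode = 'regular' | 'queue';

async function collectUnfinishedExecutionIds(
  unfinishedExecutions: Record<string, unknown[] | undefined>,
  mode: ExecutionsMode,
  findDbUnfinished: () => Promise<Array<{ id: string }>>,
): Promise<string[]> {
  const fromLogs = Object.keys(unfinishedExecutions);
  // In queue mode, do not pull ids from the DB: the main process crashing
  // does not imply the workers' jobs crashed too.
  if (mode === 'queue') return fromLogs;
  const fromDb = (await findDbUnfinished()).map((e) => e.id);
  // An execution can appear both in the logs and in the DB.
  return Array.from(new Set([...fromLogs, ...fromDb]));
}

async function demo(): Promise<void> {
  const ids = await collectUnfinishedExecutionIds(
    { 'exec-1': [], 'exec-2': [] },
    'regular',
    async () => [{ id: 'exec-2' }, { id: 'exec-3' }],
  );
  console.log(ids.join(',')); // exec-1,exec-2,exec-3
}
void demo();
```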


@@ -949,4 +949,12 @@ export type RelayEventMap = {
};
// #endregion
// #region Instance AI
'instance-ai-settings-updated': {
mcpSettingsChanged: boolean;
};
// #endregion
} & AiEventMap;


@@ -5,6 +5,8 @@ jest.mock('@n8n/instance-ai', () => {
const { z } = jest.requireActual<{ z: typeof zType }>('zod');
return {
McpClientManager: class {
getRegularTools = jest.fn().mockResolvedValue({});
getBrowserTools = jest.fn().mockResolvedValue({});
disconnect = jest.fn();
},
createDomainAccessTracker: jest.fn(),


@@ -3,6 +3,7 @@ import type { SettingsRepository, User, UserRepository } from '@n8n/db';
import { mock } from 'jest-mock-extended';
import { UnprocessableRequestError } from '@/errors/response-errors/unprocessable.error';
import type { EventService } from '@/events/event.service';
import type { AiService } from '@/services/ai.service';
import type { UserService } from '@/services/user.service';
import type { CredentialsFinderService } from '@/credentials/credentials-finder.service';
@@ -39,6 +40,7 @@ describe('InstanceAiSettingsService', () => {
const aiService = mock<AiService>();
const credentialsService = mock<CredentialsService>();
const credentialsFinderService = mock<CredentialsFinderService>();
const eventService = mock<EventService>();
let service: InstanceAiSettingsService;
@@ -52,6 +54,7 @@ describe('InstanceAiSettingsService', () => {
aiService,
credentialsService,
credentialsFinderService,
eventService,
);
});
@@ -93,6 +96,58 @@ describe('InstanceAiSettingsService', () => {
});
});
describe('instance-ai-settings-updated event', () => {
beforeEach(() => {
aiService.isProxyEnabled.mockReturnValue(false);
settingsRepository.upsert.mockResolvedValue(undefined as never);
globalConfig.instanceAi.mcpServers = '';
globalConfig.instanceAi.browserMcp = false;
});
it('emits on every successful update', async () => {
await service.updateAdminSettings({ lastMessages: 50 });
expect(eventService.emit).toHaveBeenCalledWith(
'instance-ai-settings-updated',
expect.any(Object),
);
});
it('flags mcpSettingsChanged when mcpServers changes', async () => {
await service.updateAdminSettings({ mcpServers: '[{"name":"a","url":"https://a/"}]' });
expect(eventService.emit).toHaveBeenCalledWith('instance-ai-settings-updated', {
mcpSettingsChanged: true,
});
});
it('flags mcpSettingsChanged when browserMcp toggles', async () => {
await service.updateAdminSettings({ browserMcp: true });
expect(eventService.emit).toHaveBeenCalledWith('instance-ai-settings-updated', {
mcpSettingsChanged: true,
});
});
it('does not flag mcpSettingsChanged for unrelated field changes', async () => {
await service.updateAdminSettings({ lastMessages: 50 });
expect(eventService.emit).toHaveBeenCalledWith('instance-ai-settings-updated', {
mcpSettingsChanged: false,
});
});
it('does not flag mcpSettingsChanged when mcpServers is set to the same value', async () => {
globalConfig.instanceAi.mcpServers = '[{"name":"a","url":"https://a/"}]';
await service.updateAdminSettings({ mcpServers: '[{"name":"a","url":"https://a/"}]' });
expect(eventService.emit).toHaveBeenCalledWith('instance-ai-settings-updated', {
mcpSettingsChanged: false,
});
});
});
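These tests pin down the change detection behind `mcpSettingsChanged`: the flag is only true when `mcpServers` or `browserMcp` actually differ from the stored values. A standalone sketch of that comparison (field names follow the tests; the function itself is a hypothetical distillation, not the service's code):

```typescript
// Hypothetical distillation of the change check behind mcpSettingsChanged;
// field names follow the tests, the comparison itself is an assumption.
interface AiSettings {
  mcpServers: string;
  browserMcp: boolean;
  lastMessages: number;
}

function didMcpSettingsChange(
  current: Pick<AiSettings, 'mcpServers' | 'browserMcp'>,
  update: Partial<AiSettings>,
): boolean {
  const mcpServersChanged =
    update.mcpServers !== undefined && update.mcpServers !== current.mcpServers;
  const browserMcpChanged =
    update.browserMcp !== undefined && update.browserMcp !== current.browserMcp;
  return mcpServersChanged || browserMcpChanged;
}

const current = { mcpServers: '', browserMcp: false };

console.log(didMcpSettingsChange(current, { lastMessages: 50 })); // false
console.log(didMcpSettingsChange(current, { browserMcp: true })); // true
// Setting mcpServers to its existing value is not a change.
console.log(
  didMcpSettingsChange(
    { mcpServers: '[{"name":"a","url":"https://a/"}]', browserMcp: false },
    { mcpServers: '[{"name":"a","url":"https://a/"}]' },
  ),
); // false
```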
describe('updateUserPreferences', () => {
const user = mock<User>({ id: 'user-1' });


@@ -1308,6 +1308,9 @@ function createWorkflowAdapterForTests(overrides?: {
id: 'wf-new',
name: 'Test Workflow',
active: false,
versionId: 'version-id',
activeVersionId: null,
isArchived: false,
createdAt: new Date('2026-01-01'),
updatedAt: new Date('2026-01-01'),
nodes: [],
@@ -1348,9 +1351,13 @@ function createWorkflowAdapterForTests(overrides?: {
};
const mockWorkflowService = {
archive: jest.fn().mockResolvedValue(undefined),
getMany: jest.fn().mockResolvedValue({ workflows: [savedWorkflow] }),
archive: jest.fn().mockResolvedValue(savedWorkflow),
unarchive: jest.fn().mockResolvedValue(savedWorkflow),
activateWorkflow: jest.fn().mockResolvedValue({ activeVersionId: 'version-1' }),
update: jest.fn().mockResolvedValue(savedWorkflow),
};
const mockTelemetry = { track: jest.fn() };
const mockUser = { id: 'user-1', role: { slug: 'global:member' } } as unknown as User;
@@ -1403,7 +1410,7 @@ function createWorkflowAdapterForTests(overrides?: {
{} as unknown as ConstructorParameters<typeof InstanceAiAdapterService>[26],
{} as unknown as ConstructorParameters<typeof InstanceAiAdapterService>[27],
{} as unknown as ConstructorParameters<typeof InstanceAiAdapterService>[28],
{ track: jest.fn() } as unknown as ConstructorParameters<typeof InstanceAiAdapterService>[29],
mockTelemetry as unknown as ConstructorParameters<typeof InstanceAiAdapterService>[29],
mockAiBuilderTemporaryWorkflowRepository as unknown as AiBuilderTemporaryWorkflowRepository,
{} as unknown as ConstructorParameters<typeof InstanceAiAdapterService>[31],
);
@@ -1420,6 +1427,7 @@ function createWorkflowAdapterForTests(overrides?: {
mockSharedWorkflowRepository,
mockAiBuilderTemporaryWorkflowRepository,
mockWorkflowService,
mockTelemetry,
mockUser,
};
}
@@ -1436,6 +1444,50 @@ describe('createWorkflowAdapter', () => {
mockedUserHasScopes.mockResolvedValue(true);
});
it('lists active workflows by default', async () => {
const { adapter, mockWorkflowService, mockUser } = createWorkflowAdapterForTests();
const result = await adapter.list({ limit: 10, query: 'Test' });
expect(mockWorkflowService.getMany).toHaveBeenCalledWith(mockUser, {
take: 10,
filter: {
isArchived: false,
query: 'Test',
},
});
expect(result).toEqual([
expect.objectContaining({
id: 'wf-new',
isArchived: false,
}),
]);
});
it('lists archived workflows when requested', async () => {
const { adapter, mockWorkflowService, mockUser } = createWorkflowAdapterForTests();
await adapter.list({ status: 'archived' });
expect(mockWorkflowService.getMany).toHaveBeenCalledWith(mockUser, {
take: 50,
filter: {
isArchived: true,
},
});
});
it('omits the archived filter when listing all workflows', async () => {
const { adapter, mockWorkflowService, mockUser } = createWorkflowAdapterForTests();
await adapter.list({ status: 'all' });
expect(mockWorkflowService.getMany).toHaveBeenCalledWith(mockUser, {
take: 50,
filter: {},
});
});
it('defaults to personal project when no projectId provided', async () => {
const { adapter, mockProjectRepository, mockSharedWorkflowRepository } =
createWorkflowAdapterForTests();
@@ -1477,6 +1529,18 @@ describe('createWorkflowAdapter', () => {
).rejects.toThrow('User does not have the required permissions in this project');
});
it('tracks workflow id when publishing a builder workflow', async () => {
const { adapter, mockTelemetry } = createWorkflowAdapterForTests();
await adapter.publish('wf-new');
expect(mockTelemetry.track).toHaveBeenCalledWith('Builder published workflow', {
thread_id: 'thread-1',
workflow_id: 'wf-new',
executed_by: 'ai',
});
});
it('marks the workflow as AI-builder temporary when markAsAiTemporary is true', async () => {
const {
adapter,
@@ -1560,6 +1624,35 @@ describe('createWorkflowAdapter', () => {
expect(mockAiBuilderTemporaryWorkflowRepository.unmark).toHaveBeenCalledWith('wf-archived');
});
it('unarchives a workflow', async () => {
const { adapter, mockWorkflowService } = createWorkflowAdapterForTests();
await adapter.unarchive('wf-1');
expect(mockWorkflowService.unarchive).toHaveBeenCalledWith(
expect.objectContaining({ id: 'user-1' }),
'wf-1',
);
});
it('throws when archive cannot find or access the workflow', async () => {
const { adapter, mockWorkflowService } = createWorkflowAdapterForTests();
mockWorkflowService.archive.mockResolvedValueOnce(undefined);
await expect(adapter.archive('wf-missing')).rejects.toThrow(
'Workflow wf-missing not found or not accessible',
);
});
it('throws when unarchive cannot find or access the workflow', async () => {
const { adapter, mockWorkflowService } = createWorkflowAdapterForTests();
mockWorkflowService.unarchive.mockResolvedValueOnce(undefined);
await expect(adapter.unarchive('wf-missing')).rejects.toThrow(
'Workflow wf-missing not found or not accessible',
);
});
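The two tests above cover the adapter's not-found guard: when `archive`/`unarchive` resolve to nothing, the adapter throws instead of silently succeeding. A reduced sketch of that guard (the `WorkflowSummary` type and the service callback are simplified stand-ins):

```typescript
// Reduced sketch of the not-found guard; WorkflowSummary and the callback
// are simplified stand-ins for the real service types.
interface WorkflowSummary {
  id: string;
  isArchived: boolean;
}

async function unarchiveOrThrow(
  unarchive: (id: string) => Promise<WorkflowSummary | undefined>,
  workflowId: string,
): Promise<WorkflowSummary> {
  const workflow = await unarchive(workflowId);
  // The service resolves to nothing when the workflow does not exist or the
  // user cannot access it; surface that as an explicit error.
  if (!workflow) {
    throw new Error(`Workflow ${workflowId} not found or not accessible`);
  }
  return workflow;
}

async function demo(): Promise<void> {
  try {
    await unarchiveOrThrow(async () => undefined, 'wf-missing');
  } catch (error) {
    console.log((error as Error).message); // Workflow wf-missing not found or not accessible
  }
}
void demo();
```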
describe('instance read-only mode', () => {
it('blocks createFromWorkflowJSON when branchReadOnly is true', async () => {
const { adapter } = createWorkflowAdapterForTests({ branchReadOnly: true });
@@ -1577,10 +1670,10 @@ describe('createWorkflowAdapter', () => {
);
});
it('blocks delete when branchReadOnly is true', async () => {
it('blocks unarchive when branchReadOnly is true', async () => {
const { adapter } = createWorkflowAdapterForTests({ branchReadOnly: true });
await expect(adapter.delete('wf-1')).rejects.toThrow(
await expect(adapter.unarchive('wf-1')).rejects.toThrow(
'Cannot modify workflows on a protected instance',
);
});
@@ -1928,7 +2021,15 @@ describe('resolveDataTableByIdOrName', () => {
// createExecutionAdapter run() forces save settings
// ---------------------------------------------------------------------------
function createRunAdapterForTests(workflow: Record<string, unknown>) {
function createRunAdapterForTests(
workflow: Record<string, unknown>,
options?: {
activeExecution?: boolean;
execution?: ReturnType<typeof makeExecution>;
postExecutePromise?: Promise<unknown>;
threadId?: string;
},
) {
const mockWorkflowFinderService = {
findWorkflowForUser: jest.fn().mockResolvedValue(workflow),
};
@@ -1938,12 +2039,17 @@ function createRunAdapterForTests(workflow: Record<string, unknown>) {
};
const mockActiveExecutions = {
has: jest.fn().mockReturnValue(false),
getPostExecutePromise: jest
.fn()
.mockReturnValue(options?.postExecutePromise ?? Promise.resolve()),
has: jest.fn().mockReturnValue(options?.activeExecution ?? false),
stopExecution: jest.fn(),
};
const mockExecutionRepository = {
findSingleExecution: jest.fn().mockResolvedValue(undefined),
findSingleExecution: jest.fn().mockResolvedValue(options?.execution),
};
const mockTelemetry = { track: jest.fn() };
const mockUser = { id: 'user-1', role: { slug: 'global:member' } } as unknown as User;
@@ -1987,14 +2093,14 @@ function createRunAdapterForTests(workflow: Record<string, unknown>) {
{} as unknown as ConstructorParameters<typeof InstanceAiAdapterService>[26],
{} as unknown as ConstructorParameters<typeof InstanceAiAdapterService>[27],
{} as unknown as ConstructorParameters<typeof InstanceAiAdapterService>[28],
{} as unknown as ConstructorParameters<typeof InstanceAiAdapterService>[29],
mockTelemetry as unknown as ConstructorParameters<typeof InstanceAiAdapterService>[29],
{} as unknown as ConstructorParameters<typeof InstanceAiAdapterService>[30],
{} as unknown as ConstructorParameters<typeof InstanceAiAdapterService>[31],
);
const adapter = service.createContext(mockUser).executionService;
const adapter = service.createContext(mockUser, { threadId: options?.threadId }).executionService;
return { adapter, mockWorkflowRunner };
return { adapter, mockActiveExecutions, mockTelemetry, mockWorkflowRunner };
}
describe('createExecutionAdapter run()', () => {
@@ -2041,4 +2147,78 @@ describe('createExecutionAdapter run()', () => {
saveDataErrorExecution: 'all',
});
});
it('tracks workflow id and success status when a builder execution finishes', async () => {
const { adapter, mockTelemetry } = createRunAdapterForTests(
{
id: 'wf-1',
nodes: [],
},
{
execution: makeExecution({ status: 'success' }),
threadId: 'thread-1',
},
);
await adapter.run('wf-1');
expect(mockTelemetry.track).toHaveBeenCalledWith('Builder executed workflow', {
thread_id: 'thread-1',
workflow_id: 'wf-1',
executed_by: 'ai',
pinned_node_count: 0,
exec_type: 'manual',
status: 'success',
});
});
it('tracks error status when a builder execution fails', async () => {
const { adapter, mockTelemetry } = createRunAdapterForTests(
{
id: 'wf-1',
nodes: [],
},
{
execution: makeExecution({ status: 'error', error: { message: 'boom' } }),
threadId: 'thread-1',
},
);
await adapter.run('wf-1');
expect(mockTelemetry.track).toHaveBeenCalledWith(
'Builder executed workflow',
expect.objectContaining({
workflow_id: 'wf-1',
status: 'error',
}),
);
});
it('tracks timeout cancellation as an error status', async () => {
const { adapter, mockActiveExecutions, mockTelemetry } = createRunAdapterForTests(
{
id: 'wf-1',
nodes: [],
},
{
activeExecution: true,
postExecutePromise: new Promise(() => {}),
threadId: 'thread-1',
},
);
await expect(adapter.run('wf-1', undefined, { timeout: 1 })).resolves.toMatchObject({
status: 'error',
});
expect(mockActiveExecutions.stopExecution).toHaveBeenCalled();
expect(mockTelemetry.track).toHaveBeenCalledWith(
'Builder executed workflow',
expect.objectContaining({
workflow_id: 'wf-1',
status: 'error',
}),
);
});
});
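The timeout test relies on racing the execution's post-execute promise against a timer and stopping the execution when the timer wins. A simplified sketch of that pattern (the `ActiveExecutionsLike` surface mirrors only the mocked methods; the real n8n service differs):

```typescript
// Sketch of the timeout behavior the last test checks: race the execution's
// post-execute promise against a timer, and stop the execution if the timer
// wins. The interface below mirrors only the mocked surface.
interface ActiveExecutionsLike {
  getPostExecutePromise(executionId: string): Promise<unknown>;
  stopExecution(executionId: string): void;
}

async function runWithTimeout(
  activeExecutions: ActiveExecutionsLike,
  executionId: string,
  timeoutMs: number,
): Promise<{ status: 'success' | 'error' }> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timedOut = new Promise<'timeout'>((resolve) => {
    timer = setTimeout(() => resolve('timeout'), timeoutMs);
  });
  const result = await Promise.race([
    activeExecutions.getPostExecutePromise(executionId).then(() => 'done' as const),
    timedOut,
  ]);
  clearTimeout(timer);
  if (result === 'timeout') {
    // The execution never finished in time: cancel it and report an error.
    activeExecutions.stopExecution(executionId);
    return { status: 'error' };
  }
  return { status: 'success' };
}

async function demo(): Promise<void> {
  const hanging: ActiveExecutionsLike = {
    getPostExecutePromise: () => new Promise(() => {}), // never settles
    stopExecution: (id) => console.log(`stopped ${id}`),
  };
  const outcome = await runWithTimeout(hanging, 'exec-1', 5);
  console.log(outcome.status); // error
}
void demo();
```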


@@ -270,6 +270,10 @@ describe('InstanceAiController', () => {
await controller.events(sseReq, sseRes, THREAD_ID, { lastEventId: undefined } as never);
expect(instanceAiService.replayUndeliveredTerminalOutcomes).toHaveBeenCalledWith(THREAD_ID, {
delivery: 'event',
});
const runSyncFrame = (sseRes.write as jest.Mock).mock.calls
.map(([frame]) => String(frame))
.find((frame) => frame.startsWith('event: run-sync'));
@@ -758,6 +762,7 @@ describe('InstanceAiController', () => {
const result = await controller.getThreadMessages(req, res, THREAD_ID, query);
expect(result).toMatchObject({ nextEventId: 42 });
expect(instanceAiService.replayUndeliveredTerminalOutcomes).toHaveBeenCalledWith(THREAD_ID);
expect(memoryService.getRichMessages).toHaveBeenCalledWith(USER_ID, THREAD_ID, {
limit: 50,
page: 0,


@@ -5,6 +5,8 @@ jest.mock('@n8n/instance-ai', () => {
const { z } = jest.requireActual<{ z: typeof zType }>('zod');
return {
McpClientManager: class {
getRegularTools = jest.fn().mockResolvedValue({});
getBrowserTools = jest.fn().mockResolvedValue({});
disconnect = jest.fn();
},
createDomainAccessTracker: jest.fn(),
@@ -15,10 +17,99 @@ jest.mock('@n8n/instance-ai', () => {
workflowBuildOutcomeSchema: z.object({}),
handleBuildOutcome: jest.fn(),
handleVerificationVerdict: jest.fn(),
buildAgentTreeFromEvents: jest.fn(
(events: Array<{ type: string; payload?: { text?: string } }>) => ({
agentId: 'agent-001',
role: 'orchestrator',
status: 'completed',
textContent: events
.map((event) => (event.type === 'text-delta' ? (event.payload?.text ?? '') : ''))
.join(''),
reasoning: '',
toolCalls: [],
children: [],
timeline: [],
}),
),
createInstanceAgent: jest.fn(),
createAllTools: jest.fn(),
createMemory: jest.fn(),
mapMastraChunkToEvent: jest.fn(),
InstanceAiTerminalResponseGuard: class {
constructor(private readonly options: { runId: string; rootAgentId: string }) {}
evaluateTerminal(
_events: unknown[],
status: 'completed' | 'cancelled' | 'errored',
options: { errorMessage?: string } = {},
) {
if (status === 'errored') {
return {
status,
visibilitySource: 'none',
action: 'emit',
reason: 'errored-silent',
event: {
type: 'error',
runId: this.options.runId,
agentId: this.options.rootAgentId,
responseId: `terminal-fallback:${this.options.runId}:${status}`,
payload: {
content:
options.errorMessage ??
'I hit an error before I could finish that response. Please try again.',
},
},
};
}
return {
status,
visibilitySource: 'none',
action: 'emit',
reason: status === 'cancelled' ? 'cancelled-silent' : 'completed-silent',
event: {
type: 'text-delta',
runId: this.options.runId,
agentId: this.options.rootAgentId,
responseId: `terminal-fallback:${this.options.runId}:${status}`,
payload: { text: `fallback:${status}` },
},
};
}
evaluateWaiting(_events: unknown[], confirmationEvent?: { payload?: { message?: string } }) {
if (confirmationEvent?.payload?.message) {
return {
status: 'waiting',
visibilitySource: 'confirmation-ui',
action: 'none',
reason: 'confirmation-visible',
};
}
return {
status: 'waiting',
visibilitySource: 'none',
action: 'emit',
reason: 'confirmation-invalid',
event: {
type: 'error',
runId: this.options.runId,
agentId: this.options.rootAgentId,
responseId: `terminal-fallback:${this.options.runId}:waiting`,
payload: {
content:
'I need your input to continue, but I could not display the prompt. Please try again.',
},
},
};
}
},
resumeAgentRun: jest.fn(),
TerminalOutcomeStorage: class {
constructor(_memory: unknown) {}
},
};
});
jest.mock('@mastra/core/agent', () => ({}));
@@ -33,6 +124,8 @@ jest.mock('@mastra/memory', () => ({
jest.mock('@mastra/core/workflows', () => ({}));
import type { User } from '@n8n/db';
import type { InstanceAiAgentNode, InstanceAiEvent } from '@n8n/api-types';
import { resumeAgentRun, type TerminalOutcome } from '@n8n/instance-ai';
import { InstanceAiService } from '../instance-ai.service';
@@ -146,6 +239,224 @@ function createTemporaryCleanupService({
const fakeUser = { id: 'user-1' } as User;
type TerminalOutcomeServiceInternals = {
replayUndeliveredTerminalOutcomes: (
threadId: string,
options?: { delivery?: 'snapshot' | 'event' },
) => Promise<void>;
createTerminalOutcomeStorage: jest.Mock;
dbSnapshotStorage: {
getLatest: jest.Mock;
save: jest.Mock;
updateLast: jest.Mock;
};
eventBus: {
getEventsForRun: jest.Mock;
publish: jest.Mock;
};
telemetry: { track: jest.Mock };
logger: { warn: jest.Mock };
pendingTerminalOutcomes: Map<string, TerminalOutcome>;
};
function createTerminalOutcomeService(
outcomes: TerminalOutcome[],
snapshotTree?: InstanceAiAgentNode,
): TerminalOutcomeServiceInternals {
const storage = {
getUndelivered: jest.fn(async () => outcomes),
markDelivered: jest.fn(async () => {}),
};
const service = Object.create(InstanceAiService.prototype) as TerminalOutcomeServiceInternals;
service.createTerminalOutcomeStorage = jest.fn(() => storage);
service.dbSnapshotStorage = {
getLatest: jest.fn(async () =>
snapshotTree
? {
tree: snapshotTree,
runId: 'run-1',
messageGroupId: 'group-1',
runIds: ['run-1'],
}
: undefined,
),
save: jest.fn(async () => {}),
updateLast: jest.fn(async () => {}),
};
service.eventBus = {
getEventsForRun: jest.fn(() => []),
publish: jest.fn(),
};
service.telemetry = { track: jest.fn() };
service.logger = { warn: jest.fn() };
service.pendingTerminalOutcomes = new Map();
return service;
}
type TerminalGuardOrderServiceInternals = {
evaluateTerminalResponse: (
threadId: string,
runId: string,
status: 'completed' | 'cancelled' | 'errored',
options?: { messageGroupId?: string; errorMessage?: string },
) => { action: string; reason: string } | undefined;
evaluateWaitingResponse: (
threadId: string,
runId: string,
confirmationEvent: Extract<InstanceAiEvent, { type: 'confirmation-request' }> | undefined,
options?: { messageGroupId?: string },
) => { reason: string } | undefined;
finishInvalidConfirmationRun: (args: {
threadId: string;
runId: string;
abortController: AbortController;
snapshotStorage: unknown;
}) => Promise<{ status: string; reason?: string }>;
publishRunFinish: (
threadId: string,
runId: string,
status: 'completed' | 'cancelled' | 'errored',
) => void;
runState: {
getRunIdsForMessageGroup: jest.Mock;
cancelThread: jest.Mock;
clearActiveRun: jest.Mock;
hasSuspendedRun: jest.Mock;
};
eventBus: {
events: InstanceAiEvent[];
getEventsForRun: jest.Mock;
getEventsForRuns: jest.Mock;
publish: jest.Mock;
};
telemetry: { track: jest.Mock };
logger: { warn: jest.Mock; error: jest.Mock };
traceContextsByRunId: Map<string, { threadId: string; messageGroupId?: string }>;
threadPushRef: Map<string, string>;
finalizeRunTracing: jest.Mock;
saveAgentTreeSnapshot: jest.Mock;
reapAiTemporaryFromRun: jest.Mock;
maybeFinalizeRunTraceRoot: jest.Mock;
schedulePlannedTasks: jest.Mock;
drainPendingCheckpointReentries: jest.Mock;
processResumedStream: (
agent: unknown,
resumeData: unknown,
opts: {
runId: string;
mastraRunId: string;
threadId: string;
user: User;
toolCallId: string;
signal: AbortSignal;
abortController: AbortController;
snapshotStorage: unknown;
},
) => Promise<void>;
};
type SnapshotServiceInternals = {
saveAgentTreeSnapshot: (
threadId: string,
runId: string,
snapshotStorage: {
getLatest: jest.Mock;
save: jest.Mock;
updateLast: jest.Mock;
},
isUpdate?: boolean,
overrideMessageGroupId?: string,
) => Promise<void>;
runState: {
getMessageGroupId: jest.Mock;
getRunIdsForMessageGroup: jest.Mock;
};
eventBus: {
getEventsForRun: jest.Mock;
getEventsForRuns: jest.Mock;
};
traceContextsByRunId: Map<string, { tracing?: { rootRun: { id: string; traceId: string } } }>;
logger: { warn: jest.Mock };
};
function createTerminalGuardOrderService(): TerminalGuardOrderServiceInternals {
const events: InstanceAiEvent[] = [];
const service = Object.create(
InstanceAiService.prototype,
) as unknown as TerminalGuardOrderServiceInternals;
service.runState = {
getRunIdsForMessageGroup: jest.fn(() => ['run-1']),
cancelThread: jest.fn(),
clearActiveRun: jest.fn(),
hasSuspendedRun: jest.fn(() => true),
};
service.eventBus = {
events,
getEventsForRun: jest.fn(() => events),
getEventsForRuns: jest.fn(() => events),
publish: jest.fn((_threadId: string, event: InstanceAiEvent) => {
events.push(event);
}),
};
service.telemetry = { track: jest.fn() };
service.logger = { warn: jest.fn(), error: jest.fn() };
service.traceContextsByRunId = new Map([
['run-1', { threadId: 'thread-a', messageGroupId: 'group-1' }],
]);
service.threadPushRef = new Map();
service.finalizeRunTracing = jest.fn(async () => {});
service.saveAgentTreeSnapshot = jest.fn(async () => {});
service.reapAiTemporaryFromRun = jest.fn(async () => []);
service.maybeFinalizeRunTraceRoot = jest.fn(async () => {});
service.schedulePlannedTasks = jest.fn(async () => {});
service.drainPendingCheckpointReentries = jest.fn(async () => {});
return service;
}
function createSnapshotService(): SnapshotServiceInternals {
const service = Object.create(InstanceAiService.prototype) as unknown as SnapshotServiceInternals;
service.runState = {
getMessageGroupId: jest.fn(() => undefined),
getRunIdsForMessageGroup: jest.fn(() => []),
};
service.eventBus = {
getEventsForRun: jest.fn(() => []),
getEventsForRuns: jest.fn(() => []),
};
service.traceContextsByRunId = new Map();
service.logger = { warn: jest.fn() };
return service;
}
function makeTerminalOutcome(overrides: Partial<TerminalOutcome> = {}): TerminalOutcome {
return {
id: 'group-1:task-1:completed',
threadId: 'thread-a',
runId: 'run-1',
messageGroupId: 'group-1',
correlationId: 'message-1',
taskId: 'task-1',
agentId: 'agent-builder',
status: 'completed',
userFacingMessage: 'The background workflow-builder task finished.',
createdAt: '2026-05-01T00:00:00.000Z',
...overrides,
};
}
function makeAgentTree(): InstanceAiAgentNode {
return {
agentId: 'agent-001',
role: 'orchestrator',
status: 'completed',
textContent: 'Initial response',
reasoning: '',
toolCalls: [],
children: [],
timeline: [{ type: 'text', content: 'Initial response' }],
};
}
describe('InstanceAiService — pending checkpoint re-entry', () => {
describe('queuePendingCheckpointReentry', () => {
it('records a marker keyed by threadId + checkpointTaskId', () => {
@@ -251,6 +562,288 @@ describe('InstanceAiService — pending checkpoint re-entry', () => {
});
});
describe('InstanceAiService — terminal outcome replay', () => {
it('replays undelivered background outcomes into the persisted agent tree', async () => {
const outcome = makeTerminalOutcome();
const service = createTerminalOutcomeService([outcome], makeAgentTree());
await service.replayUndeliveredTerminalOutcomes('thread-a');
expect(service.dbSnapshotStorage.updateLast).toHaveBeenCalledTimes(1);
const updatedTree = service.dbSnapshotStorage.updateLast.mock
.calls[0][1] as InstanceAiAgentNode;
expect(updatedTree.textContent).toContain(outcome.userFacingMessage);
expect(updatedTree.timeline).toContainEqual({
type: 'text',
content: outcome.userFacingMessage,
responseId: `background-outcome:${outcome.id}`,
});
expect(service.createTerminalOutcomeStorage().markDelivered).toHaveBeenCalledWith(
'thread-a',
outcome.id,
expect.any(String),
);
expect(service.eventBus.publish).not.toHaveBeenCalled();
});
it('publishes recovered background outcomes when replaying for SSE delivery', async () => {
const outcome = makeTerminalOutcome();
const service = createTerminalOutcomeService([outcome], makeAgentTree());
await service.replayUndeliveredTerminalOutcomes('thread-a', { delivery: 'event' });
expect(service.dbSnapshotStorage.updateLast).toHaveBeenCalledTimes(1);
expect(service.eventBus.publish).toHaveBeenCalledWith('thread-a', {
type: 'text-delta',
runId: outcome.runId,
agentId: 'agent-001',
responseId: `background-outcome:${outcome.id}`,
payload: { text: outcome.userFacingMessage },
});
expect(service.createTerminalOutcomeStorage().markDelivered).toHaveBeenCalledWith(
'thread-a',
outcome.id,
expect.any(String),
);
});
it('deduplicates replay by response id only', async () => {
const outcome = makeTerminalOutcome({ id: 'group-1:task-2:completed' });
const tree = makeAgentTree();
tree.textContent = `${tree.textContent}\n\n${outcome.userFacingMessage}`;
tree.timeline.push({
type: 'text',
content: outcome.userFacingMessage,
responseId: 'background-outcome:different-id',
});
const service = createTerminalOutcomeService([outcome], tree);
await service.replayUndeliveredTerminalOutcomes('thread-a');
const updatedTree = service.dbSnapshotStorage.updateLast.mock
.calls[0][1] as InstanceAiAgentNode;
expect(
updatedTree.timeline.filter(
(entry) => entry.type === 'text' && entry.content === outcome.userFacingMessage,
),
).toHaveLength(2);
expect(updatedTree.timeline).toContainEqual({
type: 'text',
content: outcome.userFacingMessage,
responseId: `background-outcome:${outcome.id}`,
});
});
it('creates a snapshot when replay has no prior agent tree', async () => {
const outcome = makeTerminalOutcome({ status: 'failed' });
const service = createTerminalOutcomeService([outcome]);
await service.replayUndeliveredTerminalOutcomes('thread-a');
expect(service.dbSnapshotStorage.save).toHaveBeenCalledTimes(1);
const savedTree = service.dbSnapshotStorage.save.mock.calls[0][1] as InstanceAiAgentNode;
expect(savedTree.status).toBe('error');
expect(savedTree.textContent).toBe(outcome.userFacingMessage);
expect(service.createTerminalOutcomeStorage().markDelivered).toHaveBeenCalledWith(
'thread-a',
outcome.id,
expect.any(String),
);
});
it('publishes the deterministic line when snapshot replay fails', async () => {
const outcome = makeTerminalOutcome();
const service = createTerminalOutcomeService([outcome], makeAgentTree());
service.dbSnapshotStorage.updateLast.mockRejectedValue(new Error('storage unavailable'));
await service.replayUndeliveredTerminalOutcomes('thread-a', { delivery: 'event' });
expect(service.eventBus.publish).toHaveBeenCalledWith('thread-a', {
type: 'text-delta',
runId: outcome.runId,
agentId: 'agent-001',
responseId: `background-outcome:${outcome.id}`,
payload: { text: outcome.userFacingMessage },
});
expect(service.createTerminalOutcomeStorage().markDelivered).not.toHaveBeenCalled();
});
it('checks persisted outcomes on repeated replay calls', async () => {
const service = createTerminalOutcomeService([]);
const storage = service.createTerminalOutcomeStorage();
service.createTerminalOutcomeStorage.mockClear();
await service.replayUndeliveredTerminalOutcomes('thread-a');
await service.replayUndeliveredTerminalOutcomes('thread-a');
expect(service.createTerminalOutcomeStorage).toHaveBeenCalledTimes(2);
expect(storage.getUndelivered).toHaveBeenCalledTimes(2);
});
});
describe('InstanceAiService — agent tree snapshots', () => {
it('falls back to persisted run ids when an old background group mapping was pruned', async () => {
const service = createSnapshotService();
const terminalEvent: InstanceAiEvent = {
type: 'text-delta',
runId: 'run-background',
agentId: 'agent-001',
payload: { text: 'background finished' },
};
const snapshotStorage = {
getLatest: jest.fn(async () => ({
tree: makeAgentTree(),
runId: 'run-original',
messageGroupId: 'group-old',
runIds: ['run-original', 'run-background'],
})),
save: jest.fn(async () => {}),
updateLast: jest.fn(async () => {}),
};
service.eventBus.getEventsForRuns.mockReturnValue([terminalEvent]);
await service.saveAgentTreeSnapshot(
'thread-a',
'run-background',
snapshotStorage,
true,
'group-old',
);
expect(service.runState.getRunIdsForMessageGroup).toHaveBeenCalledWith('group-old');
expect(snapshotStorage.getLatest).toHaveBeenCalledWith('thread-a', {
messageGroupId: 'group-old',
runId: 'run-background',
});
expect(service.eventBus.getEventsForRuns).toHaveBeenCalledWith('thread-a', [
'run-original',
'run-background',
]);
expect(snapshotStorage.updateLast).toHaveBeenCalledWith(
'thread-a',
expect.objectContaining({ textContent: 'background finished' }),
'run-background',
expect.objectContaining({
messageGroupId: 'group-old',
runIds: ['run-original', 'run-background'],
}),
);
expect(snapshotStorage.save).not.toHaveBeenCalled();
});
it('skips update snapshots when no events are available for a pruned group', async () => {
const service = createSnapshotService();
const snapshotStorage = {
getLatest: jest.fn(async () => ({
tree: makeAgentTree(),
runId: 'run-original',
messageGroupId: 'group-old',
runIds: ['run-background'],
})),
save: jest.fn(async () => {}),
updateLast: jest.fn(async () => {}),
};
await service.saveAgentTreeSnapshot(
'thread-a',
'run-background',
snapshotStorage,
true,
'group-old',
);
expect(snapshotStorage.updateLast).not.toHaveBeenCalled();
expect(snapshotStorage.save).not.toHaveBeenCalled();
expect(service.logger.warn).toHaveBeenCalledWith(
'Skipped updating empty Instance AI agent tree snapshot',
expect.objectContaining({
threadId: 'thread-a',
runId: 'run-background',
messageGroupId: 'group-old',
}),
);
});
});
describe('InstanceAiService — terminal response guard wiring', () => {
it('publishes fallback output before run-finish on a silent completed run', () => {
const service = createTerminalGuardOrderService();
service.evaluateTerminalResponse('thread-a', 'run-1', 'completed', {
messageGroupId: 'group-1',
});
service.publishRunFinish('thread-a', 'run-1', 'completed');
expect(service.eventBus.events.map((event) => event.type)).toEqual([
'text-delta',
'run-finish',
]);
});
it('publishes fallback error before run-finish on a silent failed run', () => {
const service = createTerminalGuardOrderService();
service.evaluateTerminalResponse('thread-a', 'run-1', 'errored', {
messageGroupId: 'group-1',
errorMessage: 'Safe user-facing error',
});
service.publishRunFinish('thread-a', 'run-1', 'errored');
expect(service.eventBus.events.map((event) => event.type)).toEqual(['error', 'run-finish']);
});
it('clears malformed confirmation suspension and finishes the run after the guard error', async () => {
const service = createTerminalGuardOrderService();
const abortController = new AbortController();
const decision = service.evaluateWaitingResponse('thread-a', 'run-1', undefined, {
messageGroupId: 'group-1',
});
if (decision?.reason === 'confirmation-invalid') {
await service.finishInvalidConfirmationRun({
threadId: 'thread-a',
runId: 'run-1',
abortController,
snapshotStorage: {},
});
}
expect(decision?.reason).toBe('confirmation-invalid');
expect(service.runState.cancelThread).toHaveBeenCalledWith('thread-a');
expect(abortController.signal.aborted).toBe(true);
expect(service.saveAgentTreeSnapshot).toHaveBeenCalledWith('thread-a', 'run-1', {});
expect(service.eventBus.events.map((event) => event.type)).toEqual(['error', 'run-finish']);
expect(service.eventBus.events.at(-1)).toMatchObject({
type: 'run-finish',
payload: { status: 'error' },
});
});
it('persists the resumed-run fallback error before cleanup', async () => {
const service = createTerminalGuardOrderService();
const abortController = new AbortController();
jest.mocked(resumeAgentRun).mockRejectedValueOnce(new Error('provider failed'));
await service.processResumedStream(
{},
{},
{
runId: 'run-1',
mastraRunId: 'mastra-1',
threadId: 'thread-a',
user: fakeUser,
toolCallId: 'tool-call-1',
signal: abortController.signal,
abortController,
snapshotStorage: {},
},
);
expect(service.eventBus.events.map((event) => event.type)).toEqual(['error', 'run-finish']);
expect(service.saveAgentTreeSnapshot).toHaveBeenCalledWith('thread-a', 'run-1', {});
});
});
describe('InstanceAiService — AI temporary workflow cleanup', () => {
it('defers cleanup while background tasks are running', async () => {
const { service, archiveIfAiTemporary } = createTemporaryCleanupService({

View File

@@ -7,6 +7,19 @@ function makeDate(offset = 0): Date {
return new Date(Date.now() + offset);
}
function makeSnapshotTree(text = 'Snapshot text'): InstanceAiAgentNode {
return {
agentId: 'agent-001',
role: 'orchestrator',
status: 'completed',
textContent: text,
reasoning: '',
toolCalls: [],
children: [],
timeline: [{ type: 'text', content: text }],
};
}
describe('parseStoredMessages', () => {
describe('user messages', () => {
it('should parse user message with string content', () => {
@@ -274,6 +287,140 @@ describe('parseStoredMessages', () => {
expect(result[1].runId).toBe('run_abc123');
});
it('should hydrate orphan snapshots without a matching assistant message', () => {
const snapshotCreatedAt = makeDate(1);
const tree = makeSnapshotTree('I finished the run, but I did not generate a final response.');
const messages: MastraDBMessage[] = [
{
id: 'msg-u',
role: 'user',
content: 'Build something',
createdAt: makeDate(),
},
];
const result = parseStoredMessages(messages, [
{
tree,
runId: 'run_silent',
messageGroupId: 'mg_silent',
runIds: ['run_silent'],
createdAt: snapshotCreatedAt,
updatedAt: snapshotCreatedAt,
},
]);
expect(result).toHaveLength(2);
expect(result[1]).toMatchObject({
id: 'mg_silent',
role: 'assistant',
runId: 'run_silent',
messageGroupId: 'mg_silent',
content: tree.textContent,
createdAt: snapshotCreatedAt.toISOString(),
agentTree: tree,
});
});
it('should append trailing orphan snapshots without remapping existing assistant snapshots', () => {
const firstTree = makeSnapshotTree('First assistant response');
const orphanTree = makeSnapshotTree('The run was cancelled before I could send a response.');
const messages: MastraDBMessage[] = [
{
id: 'msg-u1',
role: 'user',
content: 'First request',
createdAt: makeDate(),
},
{
id: 'msg-a1',
role: 'assistant',
content: { format: 2, content: 'First assistant response' },
createdAt: makeDate(1),
},
{
id: 'msg-u2',
role: 'user',
content: 'Cancel the next run',
createdAt: makeDate(2),
},
];
const result = parseStoredMessages(messages, [
{ tree: firstTree, runId: 'run_first', messageGroupId: 'mg_first' },
{ tree: orphanTree, runId: 'run_cancelled', messageGroupId: 'mg_cancelled' },
]);
expect(result).toHaveLength(4);
expect(result[1].runId).toBe('run_first');
expect(result[1].agentTree).toBe(firstTree);
expect(result[3]).toMatchObject({
role: 'assistant',
runId: 'run_cancelled',
messageGroupId: 'mg_cancelled',
content: orphanTree.textContent,
agentTree: orphanTree,
});
});
it('should place leading orphan snapshots before later assistant messages', () => {
const orphanTree = makeSnapshotTree('The run was cancelled before I could send a response.');
const secondTree = makeSnapshotTree('Second assistant response');
const messages: MastraDBMessage[] = [
{
id: 'msg-u1',
role: 'user',
content: 'Cancel this run',
createdAt: makeDate(),
},
{
id: 'msg-u2',
role: 'user',
content: 'Now answer normally',
createdAt: makeDate(2),
},
{
id: 'msg-a2',
role: 'assistant',
content: { format: 2, content: 'Second assistant response' },
createdAt: makeDate(3),
},
];
const result = parseStoredMessages(messages, [
{
tree: orphanTree,
runId: 'run_cancelled',
messageGroupId: 'mg_cancelled',
createdAt: makeDate(1),
updatedAt: makeDate(1),
},
{
tree: secondTree,
runId: 'run_second',
messageGroupId: 'mg_second',
createdAt: makeDate(4),
updatedAt: makeDate(4),
},
]);
expect(result).toHaveLength(4);
expect(result[1]).toMatchObject({
role: 'assistant',
runId: 'run_cancelled',
messageGroupId: 'mg_cancelled',
content: orphanTree.textContent,
agentTree: orphanTree,
});
expect(result[3]).toMatchObject({
role: 'assistant',
runId: 'run_second',
messageGroupId: 'mg_second',
content: 'Second assistant response',
agentTree: secondTree,
});
});
it('should apply renderHint correctly for known tool names', () => {
const messages: MastraDBMessage[] = [
{

View File

@@ -24,9 +24,8 @@ import { Logger } from '@n8n/backend-common';
import { Container } from '@n8n/di';
import type { EvalLlmMockHandler, EvalMockHttpResponse } from 'n8n-core';
import { jsonParse } from 'n8n-workflow';
import { z } from 'zod';
import { createEvalAgent, extractText, Tool } from '@n8n/instance-ai';
import { createEvalAgent, extractText } from '@n8n/instance-ai';
import { fetchApiDocs } from './api-docs';
import { extractNodeConfig } from './node-config';
import { redactSecretKeys, truncateForLlm } from './request-sanitizer';
@@ -35,38 +34,28 @@ import { redactSecretKeys, truncateForLlm } from './request-sanitizer';
// System prompt
// ---------------------------------------------------------------------------
const MOCK_SYSTEM_PROMPT = `You are an API mock server generating realistic HTTP responses for n8n workflow evaluation.
const MOCK_SYSTEM_PROMPT = `You generate realistic HTTP responses for one specific request, mocking an API in n8n workflow evaluation.
## Your tools
You get everything you need in the user message: the request (service, method, URL, body, query), API docs for the endpoint, the n8n node's parameters, and optional context (globalContext, nodeHint, scenarioHints). Generate the response directly; do NOT call any tools.
You have two tools. Call them before generating your response:
Response SHAPE comes from the API docs; DATA VALUES come from the node config. Use names/IDs from the config exactly (case-sensitive).
"lookup_api_docs": Fetches real API documentation for a service endpoint. Use this to learn the correct response STRUCTURE (what fields, what nesting, what types the real API returns). Pay special attention to what the real API returns for the exact HTTP method and URL path you're responding to.
Node-config patterns to know:
- "__rl" object: "value" is the selected resource id
- "schema" array: each entry's "id" is the response field name (NOT "displayName"). e.g. {id:"timestamp",displayName:"Timestamp"} → response uses "timestamp"
- Strings starting with "=" are expressions (ignore)
"get_node_config": Returns the n8n node's configuration parameters. This tells you what the node is set up to work with. The configuration contains the values the node expects to find in API responses: resource IDs, field names, column names, etc. Every node type has different parameters, so you need to interpret the config intelligently. Key patterns:
- Objects with "__rl" are resource selectors; "value" is the selected resource (a document ID, channel, project, etc.)
- "schema" arrays list the columns/fields the node expects. CRITICAL: use the "id" field as the exact column/field name in your response, NOT "displayName". For example, if schema has {"id": "timestamp", "displayName": "Timestamp"}, the API response must use "timestamp" (lowercase), not "Timestamp"
- "operation" and "resource" describe what the node does (e.g. "send" a "message", "create" an "issue")
- Strings starting with "=" are expressions (ignore these); all other strings are literal values
## How to combine them
The API docs tell you the response SHAPE. The node config tells you the exact DATA VALUES to put in that shape. All names, IDs, and identifiers from the node config are case-sensitive; use them character-for-character.
Match THIS request only (URL + method): a node may make multiple sequential calls; reply to the specific one shown. Echo identifiers, placeholders, and reference values from the request back into the response. No pagination: always indicate end of results.
## Output format
Respond with ONLY a JSON object. No explanation, no markdown, no prose.
Return ONLY a JSON object, no prose, no markdown:
{ "type": "json", "body": { ...realistic API response... } }
{ "type": "json", "body": { ... } }
{ "type": "binary", "contentType": "application/pdf", "filename": "doc.pdf" }
{ "type": "error", "statusCode": 404, "body": { ...service error format... } }
{ "type": "error", "statusCode": 404, "body": { ... } }
## Rules
- A node may make MULTIPLE sequential HTTP requests in a single execution (e.g., first GET metadata, then GET headers, then POST data). You are responding to ONE specific request. Match your response to the URL + method of THIS request only. A GET to a metadata endpoint must return metadata, not a write result, even if the node's overall purpose is to write data.
- Echo request values faithfully. If the request contains an identifier, name, or reference value (even one that looks like a placeholder such as "YOUR_CHAT_ID" or "YOUR_API_KEY"), echo it back exactly in the corresponding response field. The real API would reflect the same value the client sent.
- Some APIs return empty or minimal responses on success (204 with no body, 202 with empty body). If the API documentation indicates an empty response body, return { "type": "json", "body": {} }. Don't invent additional response fields.
- No pagination: always indicate end of results (has_more=false, nextPageToken=null, etc.)`;
For APIs that return empty responses on success (204/202), use { "type": "json", "body": {} }.`;
// ---------------------------------------------------------------------------
// Types
@@ -106,13 +95,13 @@ interface MockResponseSpec {
* response spec, then materializes it into the correct format (JSON, Buffer, error).
*/
export function createLlmMockHandler(options?: MockHandlerOptions): EvalLlmMockHandler {
// Pre-compute node configs so we don't re-extract on every request
const nodeConfigCache = new Map<string, string>();
return async (requestOptions, node) => {
if (!nodeConfigCache.has(node.name)) {
nodeConfigCache.set(node.name, extractNodeConfig(node));
}
return await generateMockResponse(requestOptions, node, {
scenarioHints: options?.scenarioHints,
globalContext: options?.globalContext,
@@ -177,6 +166,12 @@ async function generateMockResponse(
);
}
const apiDocs = await fetchApiDocs(
serviceName,
`${request.method ?? 'GET'} ${endpoint} response format`,
);
sections.push('', '## API documentation', apiDocs);
if (context.nodeConfig) {
sections.push('', '## Node Configuration', context.nodeConfig);
}
@@ -202,7 +197,7 @@ async function generateMockResponse(
for (let attempt = 0; attempt <= context.maxRetries; attempt++) {
try {
const spec = await callLlm(userPrompt, context.nodeConfig);
const spec = await callLlm(userPrompt);
return materializeSpec(spec);
} catch (error) {
lastError = error instanceof Error ? error.message : String(error);
@@ -224,49 +219,10 @@ async function generateMockResponse(
};
}
// ---------------------------------------------------------------------------
// Tool definitions (@n8n/agents)
// ---------------------------------------------------------------------------
const apiDocsTool = new Tool('lookup_api_docs')
.description(
'Look up official API documentation for a specific REST endpoint to understand the exact response format.',
)
.input(
z.object({
serviceName: z
.string()
.describe('The API service name (e.g. "Google Sheets", "Gmail", "Slack")'),
endpointDescription: z
.string()
.describe('Description of the endpoint (e.g. "GET spreadsheets values response format")'),
}),
)
.handler(async (input: { serviceName: string; endpointDescription: string }) => {
return await fetchApiDocs(input.serviceName, input.endpointDescription);
})
.build();
function createNodeConfigTool(nodeConfig: string) {
return new Tool('get_node_config')
.description(
"Get the n8n node's configuration parameters — resource IDs, field names, settings, etc. Your mock data must match these exact values.",
)
.input(z.object({}))
.handler(async () => nodeConfig)
.build();
}
// ---------------------------------------------------------------------------
// LLM call with tool use (agent handles multi-round loop automatically)
// ---------------------------------------------------------------------------
async function callLlm(userPrompt: string, nodeConfig: string): Promise<MockResponseSpec> {
async function callLlm(userPrompt: string): Promise<MockResponseSpec> {
const agent = createEvalAgent('eval-mock-responder', {
instructions: MOCK_SYSTEM_PROMPT,
})
.tool(apiDocsTool)
.tool(createNodeConfigTool(nodeConfig));
});
const result = await agent.generate(userPrompt);

View File

@@ -19,6 +19,7 @@ import { jsonParse } from 'n8n-workflow';
import { CredentialsFinderService } from '@/credentials/credentials-finder.service';
import { CredentialsService } from '@/credentials/credentials.service';
import { UnprocessableRequestError } from '@/errors/response-errors/unprocessable.error';
import { EventService } from '@/events/event.service';
import { AiService } from '@/services/ai.service';
import { UserService } from '@/services/user.service';
@@ -109,6 +110,7 @@ export class InstanceAiSettingsService {
private readonly aiService: AiService,
private readonly credentialsService: CredentialsService,
private readonly credentialsFinderService: CredentialsFinderService,
private readonly eventService: EventService,
) {
this.config = globalConfig.instanceAi;
this.deploymentConfig = globalConfig.deployment;
@@ -177,6 +179,8 @@
);
}
const c = this.config;
const previousMcpServers = c.mcpServers;
const previousBrowserMcp = c.browserMcp;
if (update.enabled !== undefined) this.enabled = update.enabled;
if (update.lastMessages !== undefined) c.lastMessages = update.lastMessages;
if (update.embedderModel !== undefined) c.embedderModel = update.embedderModel;
@@ -202,6 +206,12 @@
if (update.optinModalDismissed !== undefined)
this.optinModalDismissed = update.optinModalDismissed;
await this.persistAdminSettings();
this.eventService.emit('instance-ai-settings-updated', {
mcpSettingsChanged:
c.mcpServers !== previousMcpServers || c.browserMcp !== previousBrowserMcp,
});
return this.getAdminSettings();
}

View File

@@ -290,12 +290,14 @@ export class InstanceAiAdapterService {
return {
async list(options) {
const filter = {
...(options?.status === 'all' ? {} : { isArchived: options?.status === 'archived' }),
...(options?.query ? { query: options.query } : {}),
};
const { workflows } = await workflowService.getMany(user, {
take: options?.limit ?? 50,
filter: {
isArchived: false,
...(options?.query ? { query: options.query } : {}),
},
filter,
});
return workflows
@@ -306,6 +308,7 @@
name: wf.name,
versionId: wf.versionId,
activeVersionId: wf.activeVersionId ?? null,
isArchived: wf.isArchived,
createdAt: wf.createdAt.toISOString(),
updatedAt: wf.updatedAt.toISOString(),
}),
@@ -326,12 +329,18 @@
async archive(workflowId: string) {
assertNotReadOnly();
await workflowService.archive(user, workflowId, { skipArchived: true });
const result = await workflowService.archive(user, workflowId, { skipArchived: true });
if (!result) {
throw new Error(`Workflow ${workflowId} not found or not accessible`);
}
},
async delete(workflowId: string) {
async unarchive(workflowId: string) {
assertNotReadOnly();
await workflowService.delete(user, workflowId);
const result = await workflowService.unarchive(user, workflowId);
if (!result) {
throw new Error(`Workflow ${workflowId} not found or not accessible`);
}
},
async clearAiTemporary(workflowId: string) {
@@ -381,6 +390,7 @@
if (threadId) {
telemetry.track('Builder published workflow', {
thread_id: threadId,
workflow_id: workflowId,
executed_by: 'ai',
});
}
@@ -807,6 +817,19 @@
runData.pinData = basePinData;
}
const trackBuilderExecutedWorkflow = (status: ExecutionResult['status']) => {
if (!threadId) return;
telemetry.track('Builder executed workflow', {
thread_id: threadId,
workflow_id: workflowId,
executed_by: 'ai',
pinned_node_count: Object.keys(runData.pinData ?? {}).length,
exec_type: runData.executionMode,
status,
});
};
const executionId = await workflowRunner.run(runData);
// Wait for completion with timeout protection
@@ -838,30 +861,25 @@
} catch {
// Execution may have completed between timeout and cancel
}
return {
const result = {
executionId,
status: 'error',
error: `Execution timed out after ${timeoutMs}ms and was cancelled`,
} satisfies ExecutionResult;
trackBuilderExecutedWorkflow(result.status);
return result;
}
throw error;
}
}
if (threadId) {
telemetry.track('Builder executed workflow', {
thread_id: threadId,
executed_by: 'ai',
pinned_node_count: Object.keys(runData.pinData ?? {}).length,
exec_type: runData.executionMode,
});
}
return await extractExecutionResult(
const result = await extractExecutionResult(
executionRepository,
executionId,
allowSendingParameterValues,
);
trackBuilderExecutedWorkflow(result.status);
return result;
},
async getStatus(executionId: string) {
@@ -2972,6 +2990,7 @@ function toWorkflowDetail(
name: workflow.name,
versionId: workflow.versionId,
activeVersionId: workflow.activeVersionId ?? null,
isArchived: workflow.isArchived,
createdAt: workflow.createdAt.toISOString(),
updatedAt: workflow.updatedAt.toISOString(),
nodes: (workflow.nodes ?? []).map(

View File

@@ -171,6 +171,11 @@ export class InstanceAiController {
if (ownership === 'other_user') {
throw new ForbiddenError('Not authorized for this thread');
}
if (ownership === 'owned') {
await this.instanceAiService.replayUndeliveredTerminalOutcomes(threadId, {
delivery: 'event',
});
}
// When the thread didn't exist at connect time, another user could create
// and own it before events start flowing. We re-check once on the first
@@ -547,6 +552,7 @@
) {
this.requireInstanceAiEnabled();
await this.assertThreadAccess(req.user.id, threadId);
await this.instanceAiService.replayUndeliveredTerminalOutcomes(threadId);
// ?raw=true returns the old format for the thread inspector
if (query.raw === 'true') {

View File

@@ -3,6 +3,7 @@ import {
applyBranchReadOnlyOverrides,
buildProxyHeaders,
type InstanceAiAttachment,
type InstanceAiAgentNode,
type InstanceAiConfirmRequest,
type InstanceAiEvent,
type InstanceAiThreadStatusResponse,
@@ -12,10 +13,12 @@ import {
type TaskList,
} from '@n8n/api-types';
import { Logger } from '@n8n/backend-common';
import { GlobalConfig } from '@n8n/config';
import { GlobalConfig, SsrfProtectionConfig } from '@n8n/config';
import { ErrorReporter } from 'n8n-core';
import { Time } from '@n8n/constants';
import type { InstanceAiConfig } from '@n8n/config';
import { SsrfProtectionService } from '@/services/ssrf/ssrf-protection.service';
import { AiBuilderTemporaryWorkflowRepository, UserRepository, type User } from '@n8n/db';
import { Service } from '@n8n/di';
import { UrlService } from '@/services/url.service';
@@ -37,9 +40,11 @@ import {
buildAttachmentManifest,
isStructuredAttachment,
enrichMessageWithBackgroundTasks,
InstanceAiTerminalResponseGuard,
MastraTaskStorage,
PlannedTaskCoordinator,
PlannedTaskStorage,
TerminalOutcomeStorage,
applyPlannedTaskPermissions,
PLANNED_TASK_PERMISSION_OVERRIDES,
BuilderSandboxSessionRegistry,
@@ -70,15 +75,21 @@ import {
type ServiceProxyConfig,
type StreamableAgent,
type SuspendedRunState,
type TerminalOutcome,
type TerminalResponseDecision,
type TerminalResponseStatus,
type WorkSummary,
WorkflowTaskCoordinator,
WorkflowLoopStorage,
} from '@n8n/instance-ai';
import { setSchemaBaseDirs } from '@n8n/workflow-sdk';
import { nanoid } from 'nanoid';
import { OperationalError, UnexpectedError, UserError } from 'n8n-workflow';
import type * as Undici from 'undici';
import { v5 as uuidv5 } from 'uuid';
import { N8N_VERSION } from '@/constants';
import { EventService } from '@/events/event.service';
import { SourceControlPreferencesService } from '@/modules/source-control.ee/source-control-preferences.service.ee';
import { AiService } from '@/services/ai.service';
import { Push } from '@/push';
@@ -102,12 +113,81 @@ function getErrorMessage(error: unknown): string {
return error instanceof Error ? error.message : String(error);
}
const ORCHESTRATOR_AGENT_ID = 'agent-001';
function getUserFacingErrorMessage(error: unknown): string {
if (error instanceof UserError) {
return error.message;
}
if (error instanceof OperationalError) {
return 'I hit an operational error before I could finish that response. Please try again.';
}
if (error instanceof UnexpectedError) {
return 'Something went wrong before I could finish that response. Please try again.';
}
return 'Something went wrong before I could finish that response. Please try again.';
}
function getBackgroundOutcomeResponseId(outcome: TerminalOutcome): string {
return `background-outcome:${outcome.id}`;
}
function createTerminalOutcomeAgentTree(
outcome: TerminalOutcome,
responseId: string,
): InstanceAiAgentNode {
return {
agentId: ORCHESTRATOR_AGENT_ID,
role: 'orchestrator',
status:
outcome.status === 'cancelled'
? 'cancelled'
: outcome.status === 'failed'
? 'error'
: 'completed',
textContent: outcome.userFacingMessage,
reasoning: '',
toolCalls: [],
children: [],
timeline: [{ type: 'text', content: outcome.userFacingMessage, responseId }],
};
}
function appendTerminalOutcomeToAgentTree(
tree: InstanceAiAgentNode,
outcome: TerminalOutcome,
responseId: string,
): { tree: InstanceAiAgentNode; appended: boolean } {
const text = outcome.userFacingMessage.trim();
if (!text) return { tree, appended: false };
const alreadyInTimeline = tree.timeline.some(
(entry) => entry.type === 'text' && entry.responseId === responseId,
);
if (alreadyInTimeline) {
return { tree, appended: false };
}
return {
appended: true,
tree: {
...tree,
textContent: tree.textContent ? `${tree.textContent}\n\n${outcome.userFacingMessage}` : text,
timeline: [
...tree.timeline,
{ type: 'text', content: outcome.userFacingMessage, responseId },
],
},
};
}
function createInertAbortSignal(): AbortSignal {
return new AbortController().signal;
}
const ORCHESTRATOR_AGENT_ID = 'agent-001';
// Stable UUID namespace for deterministic feedback IDs. Submitting the same
// (key, responseId) pair twice produces the same feedback UUID so LangSmith
// upserts the record (thumbs-down → later text comment = one record, not two).
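The comment above hinges on one property: the same `(key, responseId)` pair must always map to the same feedback ID. The service uses uuid v5 with a fixed namespace for this; as a minimal sketch of the same determinism property (hash-based, not the actual uuid v5 call, and all names here are hypothetical):

```typescript
import { createHash } from "node:crypto";

// Hypothetical sketch: derive a stable feedback ID from (key, responseId).
// Identical inputs always produce the identical ID, so submitting feedback
// twice upserts one record instead of inserting a duplicate.
function deterministicFeedbackId(key: string, responseId: string): string {
  return createHash("sha1").update(`${key}:${responseId}`).digest("hex");
}

const first = deterministicFeedbackId("thumbs-down", "resp-123");
const again = deterministicFeedbackId("thumbs-down", "resp-123");
const other = deterministicFeedbackId("comment", "resp-123");

console.log(first === again); // true: same pair, same ID
console.log(first === other); // false: different key, different ID
```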
@@ -248,7 +328,7 @@ function toConfirmationData(request: InstanceAiConfirmRequest): ConfirmationData
@Service()
export class InstanceAiService {
private readonly mcpClientManager = new McpClientManager();
private readonly mcpClientManager: McpClientManager;
private readonly instanceAiConfig: InstanceAiConfig;
@@ -256,6 +336,8 @@ export class InstanceAiService {
private readonly webhookBaseUrl: string;
private readonly formBaseUrl: string;
private readonly runState = new RunStateRegistry<User>();
private readonly backgroundTasks = new BackgroundTaskManager(
@@ -303,6 +385,10 @@ export class InstanceAiService {
*/
private readonly pendingCheckpointReentries = new Map<string, Set<string>>();
private readonly pendingTerminalOutcomes = new Map<string, TerminalOutcome>();
private terminalOutcomeStorage?: TerminalOutcomeStorage;
/** Periodic sweep that auto-rejects timed-out HITL confirmations. */
private confirmationTimeoutInterval?: NodeJS.Timeout;
@@ -336,6 +422,9 @@ export class InstanceAiService {
private readonly userRepository: UserRepository,
private readonly aiBuilderTemporaryWorkflowRepository: AiBuilderTemporaryWorkflowRepository,
private readonly errorReporter: ErrorReporter,
ssrfProtectionConfig: SsrfProtectionConfig,
ssrfProtectionService: SsrfProtectionService,
private readonly eventService: EventService,
) {
this.logger = logger.scoped('instance-ai');
this.instanceAiConfig = globalConfig.instanceAi;
@@ -347,6 +436,26 @@ export class InstanceAiService {
const restEndpoint = globalConfig.endpoints.rest;
this.oauth2CallbackUrl = `${editorBaseUrl.replace(/\/$/, '')}/${restEndpoint}/oauth2-credential/callback`;
this.webhookBaseUrl = `${this.urlService.getWebhookBaseUrl()}${globalConfig.endpoints.webhook}`;
this.formBaseUrl = `${this.urlService.getWebhookBaseUrl()}${globalConfig.endpoints.form}`;
this.mcpClientManager = new McpClientManager(
ssrfProtectionConfig.enabled ? ssrfProtectionService : undefined,
);
// When the admin changes MCP settings, tear down existing clients so the
// next agent run rebuilds them against the new config. In-flight tool
// calls on disconnected clients will fail — that's accepted: the
// alternative is leaking clients keyed by stale config until shutdown.
// We only listen for the MCP-changed flag so unrelated settings saves
// don't churn live MCP connections.
this.eventService.on('instance-ai-settings-updated', ({ mcpSettingsChanged }) => {
if (!mcpSettingsChanged) return;
this.mcpClientManager.disconnect().catch((error: unknown) => {
this.logger.warn('Failed to disconnect MCP clients after settings change', {
error: getErrorMessage(error),
});
});
});
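The listener registered above follows a common teardown-on-config-change pattern: filter on the specific change flag, then discard live clients so the next use rebuilds them. A minimal self-contained sketch of that pattern (all class and event names here are hypothetical stand-ins, not the real service types):

```typescript
import { EventEmitter } from "node:events";

type SettingsUpdated = { mcpSettingsChanged: boolean };

// Hypothetical client pool: disconnect() stands in for tearing down live
// MCP connections so the next run reconnects against fresh config.
class McpClientPool {
  disconnected = 0;
  async disconnect(): Promise<void> {
    this.disconnected += 1;
  }
}

const events = new EventEmitter();
const pool = new McpClientPool();

events.on("settings-updated", (payload: SettingsUpdated) => {
  if (!payload.mcpSettingsChanged) return; // unrelated save: keep connections
  void pool.disconnect().catch(() => {
    // log and continue; a failed teardown is non-fatal, the next run rebuilds
  });
});

events.emit("settings-updated", { mcpSettingsChanged: false }); // ignored
events.emit("settings-updated", { mcpSettingsChanged: true }); // tears down
console.log(pool.disconnected); // 1
```

The early `return` on the flag is what keeps unrelated settings saves from churning live connections, matching the comment in the constructor above.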
this.startConfirmationTimeoutSweep();
}
@@ -1053,13 +1162,15 @@ export class InstanceAiService {
agentId: task.agentId,
payload: { role: task.role, result: '', error: 'Cancelled by user' },
});
void this.saveAgentTreeSnapshot(
threadId,
task.runId,
this.dbSnapshotStorage,
true,
task.messageGroupId,
);
void this.recordBackgroundTerminalOutcome(task).finally(() => {
void this.saveAgentTreeSnapshot(
threadId,
task.runId,
this.dbSnapshotStorage,
true,
task.messageGroupId,
);
});
if (user) {
void this.handlePlannedTaskSettlement(user, task, 'cancelled');
}
@@ -1112,13 +1223,15 @@ export class InstanceAiService {
// Persist the updated agent tree so cancelled status survives page reload.
// The onSettled callback in executeTask is skipped for aborted tasks,
// so we must save the snapshot explicitly here.
void this.saveAgentTreeSnapshot(
threadId,
task.runId,
this.dbSnapshotStorage,
true,
task.messageGroupId,
);
void this.recordBackgroundTerminalOutcome(task).finally(() => {
void this.saveAgentTreeSnapshot(
threadId,
task.runId,
this.dbSnapshotStorage,
true,
task.messageGroupId,
);
});
const user = this.runState.getThreadUser(threadId);
if (user) {
@@ -1436,6 +1549,372 @@ export class InstanceAiService {
return { memory, taskStorage, plannedTaskService };
}
private evaluateTerminalResponse(
threadId: string,
runId: string,
status: Exclude<TerminalResponseStatus, 'waiting'>,
options: {
messageGroupId?: string;
correlationId?: string;
workSummary?: WorkSummary;
errorMessage?: string;
} = {},
): TerminalResponseDecision | undefined {
const guard = new InstanceAiTerminalResponseGuard({
runId,
rootAgentId: ORCHESTRATOR_AGENT_ID,
messageGroupId: options.messageGroupId,
correlationId: options.correlationId,
});
const decision = guard.evaluateTerminal(
this.getTerminalGuardEvents(threadId, runId, options.messageGroupId),
status,
{
workSummary: options.workSummary,
errorMessage: options.errorMessage,
},
);
this.handleTerminalResponseDecision(threadId, runId, decision, options.messageGroupId);
return decision;
}
private evaluateWaitingResponse(
threadId: string,
runId: string,
confirmationEvent: Extract<InstanceAiEvent, { type: 'confirmation-request' }> | undefined,
options: { messageGroupId?: string; correlationId?: string } = {},
): TerminalResponseDecision | undefined {
const guard = new InstanceAiTerminalResponseGuard({
runId,
rootAgentId: ORCHESTRATOR_AGENT_ID,
messageGroupId: options.messageGroupId,
correlationId: options.correlationId,
});
const decision = guard.evaluateWaiting(
this.getTerminalGuardEvents(threadId, runId, options.messageGroupId),
confirmationEvent,
);
this.handleTerminalResponseDecision(threadId, runId, decision, options.messageGroupId);
return decision;
}
private getTerminalGuardEvents(
threadId: string,
runId: string,
messageGroupId?: string,
): InstanceAiEvent[] {
if (!messageGroupId) return this.eventBus.getEventsForRun(threadId, runId);
const groupRunIds = this.getRunIdsForMessageGroup(messageGroupId);
return groupRunIds.length > 0
? this.eventBus.getEventsForRuns(threadId, groupRunIds)
: this.eventBus.getEventsForRun(threadId, runId);
}
private handleTerminalResponseDecision(
threadId: string,
runId: string,
decision: TerminalResponseDecision,
messageGroupId?: string,
): void {
this.telemetry.track('instance_ai_terminal_response_decision', {
thread_id: threadId,
run_id: runId,
message_group_id: messageGroupId,
source: 'terminal_guard',
status: decision.status,
action: decision.action,
reason: decision.reason,
visibility_source: decision.visibilitySource,
});
if (decision.reason === 'completed-after-error') {
this.logger.warn('completed_after_error_event', {
threadId,
runId,
messageGroupId,
});
}
if (decision.reason === 'confirmation-invalid') {
this.logger.warn('invalid_confirmation_payload', {
threadId,
runId,
messageGroupId,
});
}
if (decision.action === 'emit' && decision.event) {
this.eventBus.publish(threadId, decision.event);
}
}
private createTerminalOutcomeStorage(): TerminalOutcomeStorage {
this.terminalOutcomeStorage ??= new TerminalOutcomeStorage(
createMemory(this.createMemoryConfig()),
);
return this.terminalOutcomeStorage;
}
private async finishInvalidConfirmationRun(args: {
threadId: string;
runId: string;
abortController: AbortController;
snapshotStorage: DbSnapshotStorage;
tracing?: InstanceAiTraceContext;
}): Promise<MessageTraceFinalization> {
this.runState.cancelThread(args.threadId);
args.abortController.abort();
await this.finalizeRunTracing(args.runId, args.tracing, {
status: 'error',
reason: 'invalid_confirmation_payload',
});
this.publishRunFinish(
args.threadId,
args.runId,
'errored',
'I need your input to continue, but I could not display the prompt. Please try again.',
);
await this.saveAgentTreeSnapshot(args.threadId, args.runId, args.snapshotStorage);
return {
status: 'error',
reason: 'invalid_confirmation_payload',
metadata: { completion_source: 'orchestrator' },
};
}
private buildBackgroundTerminalOutcome(task: ManagedBackgroundTask): TerminalOutcome {
const status =
task.status === 'failed' ? 'failed' : task.status === 'cancelled' ? 'cancelled' : 'completed';
const userFacingMessage =
status === 'completed'
? `The background ${task.role} task finished.`
: status === 'cancelled'
? `The background ${task.role} task was cancelled.`
: `The background ${task.role} task failed before I could complete that part.`;
return {
id: `${task.messageGroupId ?? task.runId}:${task.taskId}:${status}`,
threadId: task.threadId,
runId: task.runId,
messageGroupId: task.messageGroupId,
correlationId: task.messageGroupId,
taskId: task.taskId,
agentId: task.agentId,
status,
userFacingMessage,
createdAt: new Date().toISOString(),
};
}
async replayUndeliveredTerminalOutcomes(
threadId: string,
options: { delivery?: 'snapshot' | 'event' } = {},
): Promise<void> {
const storage = this.createTerminalOutcomeStorage();
const persistedOutcomes = await storage.getUndelivered(threadId).catch((error) => {
this.logger.warn('Failed to load undelivered Instance AI terminal outcomes', {
threadId,
error: getErrorMessage(error),
});
return [] as TerminalOutcome[];
});
const inMemoryOutcomes = [...this.pendingTerminalOutcomes.values()].filter(
(outcome) => outcome.threadId === threadId,
);
const outcomes = new Map<string, TerminalOutcome>();
for (const outcome of [...persistedOutcomes, ...inMemoryOutcomes]) {
outcomes.set(outcome.id, outcome);
}
const persistedOutcomeIds = new Set(persistedOutcomes.map((outcome) => outcome.id));
const delivery = options.delivery ?? 'snapshot';
for (const outcome of outcomes.values()) {
const responseId = getBackgroundOutcomeResponseId(outcome);
let snapshotDelivered = false;
try {
snapshotDelivered = await this.persistTerminalOutcomeLineToSnapshot(outcome, responseId);
} catch (error) {
this.logger.warn('Failed to replay Instance AI terminal outcome', {
threadId,
runId: outcome.runId,
taskId: outcome.taskId,
error: getErrorMessage(error),
});
if (delivery === 'event') {
const published = this.publishTerminalOutcomeLine(outcome, responseId);
this.telemetry.track('instance_ai_terminal_response_decision', {
thread_id: threadId,
run_id: outcome.runId,
message_group_id: outcome.messageGroupId,
task_id: outcome.taskId,
source: 'terminal_outcome_replay',
status: outcome.status,
action: published ? 'replay_event' : 'already-emitted',
visibility_source: 'background-outcome',
});
}
continue;
}
if (!snapshotDelivered) continue;
let action = 'replay_snapshot';
if (delivery === 'event') {
const published = this.publishTerminalOutcomeLine(outcome, responseId);
action = published ? 'replay_event' : 'already-emitted';
}
if (persistedOutcomeIds.has(outcome.id)) {
await storage
.markDelivered(threadId, outcome.id, new Date().toISOString())
.catch((error) => {
this.logger.warn('Failed to mark Instance AI terminal outcome as delivered', {
threadId,
runId: outcome.runId,
taskId: outcome.taskId,
error: getErrorMessage(error),
});
});
}
this.pendingTerminalOutcomes.delete(outcome.id);
this.telemetry.track('instance_ai_terminal_response_decision', {
thread_id: threadId,
run_id: outcome.runId,
message_group_id: outcome.messageGroupId,
task_id: outcome.taskId,
source: 'terminal_outcome_replay',
status: outcome.status,
action,
visibility_source: 'background-outcome',
});
}
}
private async persistTerminalOutcomeLineToSnapshot(
outcome: TerminalOutcome,
responseId: string,
): Promise<boolean> {
const snapshot = await this.dbSnapshotStorage.getLatest(outcome.threadId, {
messageGroupId: outcome.messageGroupId,
runId: outcome.runId,
});
if (!snapshot) {
await this.dbSnapshotStorage.save(
outcome.threadId,
createTerminalOutcomeAgentTree(outcome, responseId),
outcome.runId,
{
messageGroupId: outcome.messageGroupId,
runIds: [outcome.runId],
},
);
return true;
}
const { tree } = appendTerminalOutcomeToAgentTree(snapshot.tree, outcome, responseId);
const runIds = new Set(snapshot.runIds ?? [snapshot.runId]);
runIds.add(outcome.runId);
await this.dbSnapshotStorage.updateLast(outcome.threadId, tree, snapshot.runId, {
messageGroupId: snapshot.messageGroupId ?? outcome.messageGroupId,
runIds: [...runIds],
langsmithRunId: snapshot.langsmithRunId,
langsmithTraceId: snapshot.langsmithTraceId,
});
return true;
}
private publishTerminalOutcomeLine(outcome: TerminalOutcome, responseId: string): boolean {
const alreadyPublished = this.eventBus
.getEventsForRun(outcome.threadId, outcome.runId)
.some((event) => event.responseId === responseId);
if (alreadyPublished) return false;
this.eventBus.publish(outcome.threadId, {
type: 'text-delta',
runId: outcome.runId,
agentId: ORCHESTRATOR_AGENT_ID,
responseId,
payload: { text: outcome.userFacingMessage },
});
return true;
}
private async recordBackgroundTerminalOutcome(task: ManagedBackgroundTask): Promise<void> {
const outcome = this.buildBackgroundTerminalOutcome(task);
let persisted = false;
try {
await this.createTerminalOutcomeStorage().upsert(task.threadId, outcome);
persisted = true;
} catch (error) {
this.pendingTerminalOutcomes.set(outcome.id, outcome);
this.logger.warn('Failed to persist Instance AI terminal outcome', {
threadId: task.threadId,
runId: task.runId,
taskId: task.taskId,
error: getErrorMessage(error),
});
this.telemetry.track('instance_ai_terminal_outcome_persistence_failure', {
thread_id: task.threadId,
run_id: task.runId,
task_id: task.taskId,
status: outcome.status,
phase: 'metadata',
});
}
const responseId = getBackgroundOutcomeResponseId(outcome);
const published = this.publishTerminalOutcomeLine(outcome, responseId);
this.telemetry.track('instance_ai_terminal_response_decision', {
thread_id: task.threadId,
run_id: task.runId,
message_group_id: task.messageGroupId,
task_id: task.taskId,
source: 'background_outcome',
status: outcome.status,
action: published ? 'emit' : 'already-emitted',
visibility_source: 'background-outcome',
});
let snapshotDelivered = false;
try {
snapshotDelivered = await this.persistTerminalOutcomeLineToSnapshot(outcome, responseId);
} catch (error) {
this.logger.warn('Failed to persist Instance AI terminal outcome line to snapshot', {
threadId: task.threadId,
runId: task.runId,
taskId: task.taskId,
error: getErrorMessage(error),
});
this.telemetry.track('instance_ai_terminal_outcome_persistence_failure', {
thread_id: task.threadId,
run_id: task.runId,
task_id: task.taskId,
status: outcome.status,
phase: 'snapshot',
});
}
if (!persisted || !snapshotDelivered) return;
try {
await this.createTerminalOutcomeStorage().markDelivered(
task.threadId,
outcome.id,
new Date().toISOString(),
);
this.pendingTerminalOutcomes.delete(outcome.id);
} catch (error) {
this.logger.warn('Failed to mark Instance AI terminal outcome as delivered', {
threadId: task.threadId,
runId: task.runId,
taskId: task.taskId,
error: getErrorMessage(error),
});
}
}
private async syncPlannedTasksToUi(threadId: string, graph: PlannedTaskGraph): Promise<void> {
const { taskStorage } = await this.createPlannedTaskState();
const tasks = this.projectPlannedTaskList(graph);
@@ -1485,7 +1964,11 @@ export class InstanceAiService {
messageGroupId?: string,
pushRef?: string,
) {
const localGatewayDisabled = await this.settingsService.isLocalGatewayDisabledForUser(user.id);
const localGatewayDisabledGlobally =
this.settingsService.getAdminSettings().localGatewayDisabled;
const localGatewayDisabledForUser = await this.settingsService.isLocalGatewayDisabledForUser(
user.id,
);
const userGateway = this.gatewayRegistry.findGateway(user.id);
// When the proxy is enabled, create a single ProxyTokenManager and
@@ -1528,7 +2011,7 @@ export class InstanceAiService {
pushRef,
threadId,
});
if (!localGatewayDisabled && userGateway?.isConnected) {
if (!localGatewayDisabledForUser && userGateway?.isConnected) {
context.localMcpServer = userGateway;
}
context.permissions = this.settingsService.getPermissions();
@@ -1546,14 +2029,19 @@ export class InstanceAiService {
context.runId = runId;
// Compute gateway status for the system prompt
if (localGatewayDisabled) {
context.localGatewayStatus = { status: 'disabled' };
} else if (userGateway?.isConnected) {
context.localGatewayStatus = { status: 'connected' };
if (localGatewayDisabledGlobally) {
context.localGatewayStatus = { status: 'disabledGlobally' };
} else if (!localGatewayDisabledForUser && userGateway?.isConnected) {
context.localGatewayStatus = {
status: 'connected',
capabilities: userGateway
.getStatus()
.toolCategories.filter(({ enabled }) => enabled)
.map(({ name }) => name),
};
} else {
context.localGatewayStatus = {
status: 'disconnected',
capabilities: ['filesystem', 'browser'],
status: localGatewayDisabledForUser ? 'disabled' : 'disconnected',
};
}
@@ -1605,6 +2093,7 @@ export class InstanceAiService {
localMcpServer: context.localMcpServer,
oauth2CallbackUrl: this.oauth2CallbackUrl,
webhookBaseUrl: this.webhookBaseUrl,
formBaseUrl: this.formBaseUrl,
waitForConfirmation: async (requestId: string) => {
return await new Promise<ConfirmationData>((resolve) => {
this.runState.registerPendingConfirmation(requestId, {
@@ -2007,9 +2496,11 @@ export class InstanceAiService {
let tracing: InstanceAiTraceContext | undefined;
let messageTraceFinalization: MessageTraceFinalization | undefined;
let aiCreatedWorkflowIds: Set<string> | undefined;
let activeSnapshotStorage: DbSnapshotStorage | undefined;
let messageId = '';
try {
const messageId = nanoid();
messageId = nanoid();
// Publish run-start (includes userId for audit trail attribution)
this.eventBus.publish(threadId, {
@@ -2022,6 +2513,10 @@ export class InstanceAiService {
// Check if already cancelled before starting agent work
if (signal.aborted) {
this.evaluateTerminalResponse(threadId, runId, 'cancelled', {
messageGroupId,
correlationId: messageId,
});
this.eventBus.publish(threadId, {
type: 'run-finish',
runId,
@@ -2034,16 +2529,18 @@ export class InstanceAiService {
const mcpServers = this.parseMcpServers(this.instanceAiConfig.mcpServers);
const executionPushRef = this.threadPushRef.get(threadId);
const environment = await this.createExecutionEnvironment(
user,
threadId,
runId,
signal,
researchMode,
messageGroupId,
executionPushRef,
);
activeSnapshotStorage = environment.snapshotStorage;
const { context, memory, taskStorage, snapshotStorage, modelId, orchestrationContext } =
await this.createExecutionEnvironment(
user,
threadId,
runId,
signal,
researchMode,
messageGroupId,
executionPushRef,
);
environment;
aiCreatedWorkflowIds = context.aiCreatedWorkflowIds ??= new Set<string>();
// Make the current user message available to sub-agents (e.g. planner)
// since memory.recall() only returns previously-saved messages.
@@ -2136,6 +2633,7 @@ export class InstanceAiService {
context,
orchestrationContext,
mcpServers,
mcpManager: this.mcpClientManager,
memoryConfig,
memory,
disableDeferredTools: true,
@@ -2371,6 +2869,27 @@ export class InstanceAiService {
});
}
const waitingDecision = this.evaluateWaitingResponse(
threadId,
runId,
result.confirmationEvent,
{
messageGroupId,
correlationId: messageId,
},
);
if (waitingDecision?.reason === 'confirmation-invalid') {
messageTraceFinalization = await this.finishInvalidConfirmationRun({
threadId,
runId,
abortController,
snapshotStorage,
tracing,
});
return;
}
if (result.confirmationEvent) {
this.trackConfirmationRequest(threadId, result.confirmationEvent);
this.eventBus.publish(threadId, result.confirmationEvent);
@@ -2384,6 +2903,11 @@ export class InstanceAiService {
}
const outputText = await (result.text ?? Promise.resolve(''));
this.evaluateTerminalResponse(threadId, runId, result.status, {
messageGroupId,
correlationId: messageId,
workSummary: result.workSummary,
});
const finalStatus = result.status === 'errored' ? 'error' : result.status;
await this.finalizeRunTracing(runId, tracing, {
status: finalStatus,
@@ -2420,6 +2944,10 @@ export class InstanceAiService {
}
} catch (error) {
if (signal.aborted) {
this.evaluateTerminalResponse(threadId, runId, 'cancelled', {
messageGroupId,
correlationId: messageId,
});
await this.finalizeRunTracing(runId, tracing, {
status: 'cancelled',
reason: 'user_cancelled',
@@ -2435,16 +2963,25 @@ export class InstanceAiService {
aiCreatedWorkflowIds,
);
this.publishRunFinish(threadId, runId, 'cancelled', 'user_cancelled', archivedWorkflowIds);
if (activeSnapshotStorage) {
await this.saveAgentTreeSnapshot(threadId, runId, activeSnapshotStorage);
}
return;
}
const errorMessage = getErrorMessage(error);
const userFacingErrorMessage = getUserFacingErrorMessage(error);
this.logger.error('Instance AI run error', {
error: errorMessage,
threadId,
runId,
});
this.evaluateTerminalResponse(threadId, runId, 'errored', {
messageGroupId,
correlationId: messageId,
errorMessage: userFacingErrorMessage,
});
await this.finalizeRunTracing(runId, tracing, {
status: 'error',
reason: errorMessage,
@@ -2466,10 +3003,13 @@ export class InstanceAiService {
agentId: ORCHESTRATOR_AGENT_ID,
payload: {
status: 'error',
reason: errorMessage,
reason: userFacingErrorMessage,
...(archivedWorkflowIds.length > 0 ? { archivedWorkflowIds } : {}),
},
});
if (activeSnapshotStorage) {
await this.saveAgentTreeSnapshot(threadId, runId, activeSnapshotStorage);
}
} finally {
this.runState.clearActiveRun(threadId);
// Note: don't delete threadPushRef here. Planned tasks (build agent,
@@ -2864,6 +3404,25 @@ export class InstanceAiService {
});
}
const messageGroupId = this.traceContextsByRunId.get(opts.runId)?.messageGroupId;
const waitingDecision = this.evaluateWaitingResponse(
opts.threadId,
opts.runId,
result.confirmationEvent,
{ messageGroupId },
);
if (waitingDecision?.reason === 'confirmation-invalid') {
messageTraceFinalization = await this.finishInvalidConfirmationRun({
threadId: opts.threadId,
runId: opts.runId,
abortController: opts.abortController,
snapshotStorage: opts.snapshotStorage,
tracing: opts.tracing,
});
return;
}
if (result.confirmationEvent) {
this.trackConfirmationRequest(opts.threadId, result.confirmationEvent);
this.eventBus.publish(opts.threadId, result.confirmationEvent);
@@ -2877,6 +3436,11 @@ export class InstanceAiService {
}
const outputText = await (result.text ?? Promise.resolve(''));
const messageGroupId = this.traceContextsByRunId.get(opts.runId)?.messageGroupId;
this.evaluateTerminalResponse(opts.threadId, opts.runId, result.status, {
messageGroupId,
workSummary: result.workSummary,
});
const finalStatus = result.status === 'errored' ? 'error' : result.status;
await this.finalizeRunTracing(opts.runId, opts.tracing, {
status: finalStatus,
@@ -2907,6 +3471,10 @@ export class InstanceAiService {
}
} catch (error) {
if (opts.signal.aborted) {
const messageGroupId = this.traceContextsByRunId.get(opts.runId)?.messageGroupId;
this.evaluateTerminalResponse(opts.threadId, opts.runId, 'cancelled', {
messageGroupId,
});
await this.finalizeRunTracing(opts.runId, opts.tracing, {
status: 'cancelled',
reason: 'user_cancelled',
@@ -2928,16 +3496,23 @@ export class InstanceAiService {
'user_cancelled',
archivedWorkflowIds,
);
await this.saveAgentTreeSnapshot(opts.threadId, opts.runId, opts.snapshotStorage);
return;
}
const errorMessage = getErrorMessage(error);
const userFacingErrorMessage = getUserFacingErrorMessage(error);
this.logger.error('Instance AI resumed run error', {
error: errorMessage,
threadId: opts.threadId,
runId: opts.runId,
});
const messageGroupId = this.traceContextsByRunId.get(opts.runId)?.messageGroupId;
this.evaluateTerminalResponse(opts.threadId, opts.runId, 'errored', {
messageGroupId,
errorMessage: userFacingErrorMessage,
});
await this.finalizeRunTracing(opts.runId, opts.tracing, {
status: 'error',
reason: errorMessage,
@@ -2959,10 +3534,11 @@ export class InstanceAiService {
agentId: ORCHESTRATOR_AGENT_ID,
payload: {
status: 'error',
reason: errorMessage,
reason: userFacingErrorMessage,
...(archivedWorkflowIds.length > 0 ? { archivedWorkflowIds } : {}),
},
});
await this.saveAgentTreeSnapshot(opts.threadId, opts.runId, opts.snapshotStorage);
} finally {
this.runState.clearActiveRun(opts.threadId);
// See note in executeRun's finally — keep threadPushRef alive for
@@ -3064,6 +3640,7 @@ export class InstanceAiService {
}
},
onSettled: async (task) => {
await this.recordBackgroundTerminalOutcome(task);
await this.saveAgentTreeSnapshot(
opts.threadId,
runId,
@@ -3425,11 +4002,9 @@ export class InstanceAiService {
options?: { userId?: string; modelId?: ModelConfig; archivedWorkflowIds?: string[] },
): Promise<void> {
this.publishRunFinish(threadId, runId, status, undefined, options?.archivedWorkflowIds);
if (status === 'completed') {
await this.saveAgentTreeSnapshot(threadId, runId, snapshotStorage);
if (options?.userId && options?.modelId) {
void this.refineTitleIfNeeded(threadId, options.userId, options.modelId);
}
await this.saveAgentTreeSnapshot(threadId, runId, snapshotStorage);
if (status === 'completed' && options?.userId && options?.modelId) {
void this.refineTitleIfNeeded(threadId, options.userId, options.modelId);
}
}
@@ -3524,10 +4099,22 @@ export class InstanceAiService {
let groupRunIds: string[] | undefined;
if (messageGroupId) {
groupRunIds = this.getRunIdsForMessageGroup(messageGroupId);
if (groupRunIds.length === 0) {
const snapshot = await snapshotStorage.getLatest(threadId, { messageGroupId, runId });
groupRunIds = snapshot?.runIds?.length ? snapshot.runIds : [runId];
}
events = this.eventBus.getEventsForRuns(threadId, groupRunIds);
} else {
events = this.eventBus.getEventsForRun(threadId, runId);
}
if (isUpdate && events.length === 0) {
this.logger.warn('Skipped updating empty Instance AI agent tree snapshot', {
threadId,
runId,
messageGroupId,
});
return;
}
const agentTree = buildAgentTreeFromEvents(events);
const tracing = this.traceContextsByRunId.get(runId)?.tracing;

View File

@@ -182,6 +182,45 @@ function buildFlatAgentTree(
};
}
function snapshotTimestamp(snapshot: AgentTreeSnapshot): string {
return (snapshot.updatedAt ?? snapshot.createdAt ?? new Date(0)).toISOString();
}
function snapshotCreatedAtMs(snapshot: AgentTreeSnapshot): number | undefined {
return snapshot.createdAt?.getTime();
}
function messageCreatedAtMs(message: MastraDBMessage): number {
return message.createdAt.getTime();
}
function getNextConversationMessageTimestamp(
messages: MastraDBMessage[],
currentIndex: number,
): number | undefined {
for (let i = currentIndex + 1; i < messages.length; i++) {
const role = messages[i].role;
if (role === 'user' || role === 'assistant') return messageCreatedAtMs(messages[i]);
}
return undefined;
}
function buildSnapshotMessage(snapshot: AgentTreeSnapshot): InstanceAiMessage {
const groupId = snapshot.messageGroupId ?? snapshot.runId;
return {
id: groupId,
runId: snapshot.runId,
messageGroupId: snapshot.messageGroupId,
runIds: snapshot.runIds,
role: 'assistant',
createdAt: snapshotTimestamp(snapshot),
content: snapshot.tree.textContent,
reasoning: snapshot.tree.reasoning,
isStreaming: false,
agentTree: snapshot.tree,
};
}
// ---------------------------------------------------------------------------
// Main parser
// ---------------------------------------------------------------------------
@@ -195,17 +234,75 @@ export function parseStoredMessages(
snapshots?: RunSnapshots,
): InstanceAiMessage[] {
const messages: InstanceAiMessage[] = [];
const snapshotList = snapshots ?? [];
// Snapshots are stored as a chronological array — the Nth snapshot
// corresponds to the Nth assistant message. We align from the END
// so old messages (before snapshots existed) get flat trees.
// Snapshots are stored chronologically. DB-backed snapshots have timestamps,
// so use them to place orphan snapshots before, between, or after assistant
// rows. Older tests and legacy snapshots may not have timestamps; for those,
// keep the positional alignment behavior.
const assistantCount = mastraMessages.filter((m) => m.role === 'assistant').length;
const snapshotOffset = assistantCount - (snapshots?.length ?? 0);
const hasSnapshotTimestamps = snapshotList.some((snapshot) => snapshot.createdAt !== undefined);
const snapshotCount = snapshotList.length;
const snapshotOffset =
!hasSnapshotTimestamps && snapshotCount <= assistantCount ? assistantCount - snapshotCount : 0;
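The positional fallback computed above can be illustrated in isolation. A minimal sketch, with hypothetical data, of why aligning from the end leaves the oldest assistant rows without snapshots:

```typescript
// Hypothetical data: four assistant rows, but only the two newest runs
// produced snapshots (older rows predate snapshot support).
const assistantMessages = ["a1", "a2", "a3", "a4"];
const snapshots = ["s-for-a3", "s-for-a4"];

// End-alignment: the Nth snapshot pairs with the Nth assistant row counted
// from the END, so the offset shifts indices past the snapshot-less rows.
const offset = assistantMessages.length - snapshots.length; // 2

const aligned = assistantMessages.map((id, assistantIdx) => {
  const snapshotIdx = assistantIdx - offset;
  const snapshot =
    snapshotIdx >= 0 && snapshotIdx < snapshots.length
      ? snapshots[snapshotIdx]
      : undefined;
  return { id, snapshot };
});

console.log(aligned.map((entry) => entry.snapshot));
// [ undefined, undefined, 's-for-a3', 's-for-a4' ]
```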
let assistantIdx = 0;
let nextSnapshotIdx = 0;
const consumedSnapshots = new Set<AgentTreeSnapshot>();
let lastUserMessageId: string | undefined;
for (const msg of mastraMessages) {
function appendChronologicalOrphansBefore(message: MastraDBMessage): void {
if (!hasSnapshotTimestamps) return;
const messageTimestamp = messageCreatedAtMs(message);
while (nextSnapshotIdx < snapshotList.length) {
const snapshot = snapshotList[nextSnapshotIdx];
const snapshotTimestamp = snapshotCreatedAtMs(snapshot);
if (snapshotTimestamp === undefined || snapshotTimestamp >= messageTimestamp) return;
consumedSnapshots.add(snapshot);
messages.push(buildSnapshotMessage(snapshot));
nextSnapshotIdx++;
}
}
function takeSnapshotForAssistant(
message: MastraDBMessage,
messageIndex: number,
): AgentTreeSnapshot | undefined {
if (!hasSnapshotTimestamps) {
const snapshotIdx = assistantIdx - snapshotOffset;
const snapshot =
snapshotIdx >= 0 && snapshotIdx < snapshotList.length
? snapshotList[snapshotIdx]
: undefined;
if (snapshot) consumedSnapshots.add(snapshot);
return snapshot;
}
appendChronologicalOrphansBefore(message);
const snapshot = snapshotList[nextSnapshotIdx];
if (!snapshot) return undefined;
const nextMessageTimestamp = getNextConversationMessageTimestamp(mastraMessages, messageIndex);
const snapshotTimestamp = snapshotCreatedAtMs(snapshot);
if (
snapshotTimestamp !== undefined &&
nextMessageTimestamp !== undefined &&
snapshotTimestamp > nextMessageTimestamp
) {
return undefined;
}
consumedSnapshots.add(snapshot);
nextSnapshotIdx++;
return snapshot;
}
for (const [messageIndex, msg] of mastraMessages.entries()) {
appendChronologicalOrphansBefore(msg);
const text = extractTextFromContent(msg.content);
if (msg.role === 'user') {
@ -232,12 +329,7 @@ export function parseStoredMessages(
const toolCalls = invocations.map(buildToolCallState);
const parts = extractParts(msg.content);
// Match snapshot by position: Nth assistant message → Nth snapshot (aligned from end)
const snapshotIdx = assistantIdx - snapshotOffset;
const snapshot =
snapshots && snapshotIdx >= 0 && snapshotIdx < snapshots.length
? snapshots[snapshotIdx]
: undefined;
const snapshot = takeSnapshotForAssistant(msg, messageIndex);
assistantIdx++;
// Use the native runId from the snapshot (matches SSE events),
@ -268,6 +360,11 @@ export function parseStoredMessages(
// in the assistant message's content
}
for (const snapshot of snapshots ?? []) {
if (consumedSnapshots.has(snapshot)) continue;
messages.push(buildSnapshotMessage(snapshot));
}
// Deduplicate assistant messages by messageGroupId.
// Follow-up runs in the same group produce separate DB rows; keep only
// the latest (which carries the full runIds array and complete tree).
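The hunks above replace pure positional snapshot matching with timestamp ordering, keeping the end-aligned positional offset only as a fallback for legacy snapshots that carry no `createdAt`. A minimal standalone sketch of that fallback rule (simplified stand-in types, not the real `MastraDBMessage`/`AgentTreeSnapshot` shapes):

```typescript
// Simplified stand-ins for the real message/snapshot types (assumption).
type Msg = { role: 'user' | 'assistant' };
type Snap = { createdAt?: number };

// Positional fallback: when no snapshot carries a timestamp, align the
// last N snapshots with the last N assistant messages by offsetting from
// the end; with timestamps (or more snapshots than assistants) use 0.
function snapshotOffsetFor(messages: Msg[], snapshots: Snap[]): number {
  const assistantCount = messages.filter((m) => m.role === 'assistant').length;
  const hasTimestamps = snapshots.some((s) => s.createdAt !== undefined);
  return !hasTimestamps && snapshots.length <= assistantCount
    ? assistantCount - snapshots.length
    : 0;
}
```

For example, three assistant messages with two untimestamped snapshots give an offset of 1, so the first snapshot pairs with the second assistant message.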

View File

@ -46,6 +46,8 @@ export class DbSnapshotStorage {
runIds: row.runIds ?? undefined,
langsmithRunId: row.langsmithRunId ?? undefined,
langsmithTraceId: row.langsmithTraceId ?? undefined,
createdAt: row.createdAt,
updatedAt: row.updatedAt,
};
}
@ -133,6 +135,8 @@ export class DbSnapshotStorage {
runIds: r.runIds ?? undefined,
langsmithRunId: r.langsmithRunId ?? undefined,
langsmithTraceId: r.langsmithTraceId ?? undefined,
createdAt: r.createdAt,
updatedAt: r.updatedAt,
}));
}

View File

@ -41,6 +41,7 @@ import { InstanceAiThreadRepository } from '../repositories/instance-ai-thread.r
const PATCH_ONLY_METADATA_KEYS = [
'instanceAiPlannedTasks',
'instanceAiTasks',
'instanceAiTerminalOutcomes',
'instanceAiWorkflowLoop',
] as const;

View File

@ -1,5 +1,4 @@
import { type User, type ProjectRepository, WorkflowEntity } from '@n8n/db';
import { layoutWorkflowJSON } from '@n8n/workflow-sdk';
import z from 'zod';
import { MCP_CREATE_WORKFLOW_FROM_CODE_TOOL, CODE_BUILDER_VALIDATE_TOOL } from './constants';
@ -149,7 +148,7 @@ export const createCreateWorkflowFromCodeTool = (
const strippedCode = stripImportStatements(code);
const result = await handler.parseAndValidate(strippedCode);
const workflowJson = layoutWorkflowJSON(result.workflow);
const workflowJson = result.workflow;
newWorkflow = new WorkflowEntity();
Object.assign(newWorkflow, {

View File

@ -1,5 +1,4 @@
import { type User, type SharedWorkflowRepository, WorkflowEntity } from '@n8n/db';
import { layoutWorkflowJSON } from '@n8n/workflow-sdk';
import z from 'zod';
import { USER_CALLED_MCP_TOOL_EVENT } from '../../mcp.constants';
@ -132,7 +131,7 @@ export const createUpdateWorkflowTool = (
const strippedCode = stripImportStatements(code);
const result = await handler.parseAndValidate(strippedCode);
const workflowJson = layoutWorkflowJSON(result.workflow);
const workflowJson = result.workflow;
const workflowUpdateData = new WorkflowEntity();
Object.assign(workflowUpdateData, {

View File

@ -1,25 +1,27 @@
import { Container } from '@n8n/di';
import type { Response } from 'express';
import type { AuditRequest } from '@/public-api/types';
import type { PublicAPIEndpoint } from '../../shared/handler.types';
import { apiKeyHasScopeWithGlobalScopeFallback } from '../../shared/middlewares/global.middleware';
export = {
type AuditHandlers = {
generateAudit: PublicAPIEndpoint<AuditRequest.Generate>;
};
const auditHandlers: AuditHandlers = {
generateAudit: [
apiKeyHasScopeWithGlobalScopeFallback({ scope: 'securityAudit:generate' }),
async (req: AuditRequest.Generate, res: Response): Promise<Response> => {
try {
const { SecurityAuditService } = await import('@/security-audit/security-audit.service');
const result = await Container.get(SecurityAuditService).run(
req.body?.additionalOptions?.categories,
req.body?.additionalOptions?.daysAbandonedWorkflow,
);
async (req, res) => {
const { SecurityAuditService } = await import('@/security-audit/security-audit.service');
const result = await Container.get(SecurityAuditService).run(
req.body?.additionalOptions?.categories,
req.body?.additionalOptions?.daysAbandonedWorkflow,
);
return res.json(result);
} catch (error) {
return res.status(500).json(error);
}
return res.json(result);
},
],
};
export = auditHandlers;

View File

@ -1,15 +1,16 @@
import { mockInstance } from '@n8n/backend-test-utils';
import { Container } from '@n8n/di';
import type { Response } from 'express';
import { mock } from 'jest-mock-extended';
import { RESPONSE_ERROR_MESSAGES } from '@/constants';
import { BadRequestError } from '@/errors/response-errors/bad-request.error';
import { NotFoundError } from '@/errors/response-errors/not-found.error';
import type { InstalledPackages } from '@/modules/community-packages/installed-packages.entity';
import { CommunityPackagesLifecycleService } from '@/modules/community-packages/community-packages.lifecycle.service';
import type { InstalledPackages } from '@/modules/community-packages/installed-packages.entity';
import * as middlewares from '@/public-api/v1/shared/middlewares/global.middleware';
import { mapToCommunityPackage, mapToCommunityPackageList } from '../community-packages.mapper';
import { mock } from 'jest-mock-extended';
const mockMiddleware = jest.fn(async (_req: unknown, _res: unknown, next: unknown) =>
(next as () => void)(),
@ -85,7 +86,7 @@ describe('CommunityPackages Handler', () => {
);
});
it('should return 400 when name is missing', async () => {
it('should throw BadRequestError when name is missing', async () => {
const req = {
body: {},
user: mockUser,
@ -95,12 +96,21 @@ describe('CommunityPackages Handler', () => {
new BadRequestError(RESPONSE_ERROR_MESSAGES.PACKAGE_NAME_NOT_PROVIDED),
);
await handler.installPackage[handler.installPackage.length - 1](req, mockResponse);
expect(mockResponse.status).toHaveBeenCalledWith(400);
const handlerFn = handler.installPackage[handler.installPackage.length - 1];
let caught: unknown;
try {
await handlerFn(req, mockResponse);
} catch (error) {
caught = error;
}
expect(caught).toBeInstanceOf(BadRequestError);
expect(caught).toMatchObject({
message: RESPONSE_ERROR_MESSAGES.PACKAGE_NAME_NOT_PROVIDED,
httpStatusCode: 400,
});
});
it('should return 400 when package is already installed', async () => {
it('should throw BadRequestError when package is already installed', async () => {
const req = {
body: { name: 'n8n-nodes-test' },
user: mockUser,
@ -110,9 +120,18 @@ describe('CommunityPackages Handler', () => {
new BadRequestError('Package "n8n-nodes-test" is already installed'),
);
await handler.installPackage[handler.installPackage.length - 1](req, mockResponse);
expect(mockResponse.status).toHaveBeenCalledWith(400);
const handlerFn = handler.installPackage[handler.installPackage.length - 1];
let caught: unknown;
try {
await handlerFn(req, mockResponse);
} catch (error) {
caught = error;
}
expect(caught).toBeInstanceOf(BadRequestError);
expect(caught).toMatchObject({
message: 'Package "n8n-nodes-test" is already installed',
httpStatusCode: 400,
});
});
});
@ -171,7 +190,7 @@ describe('CommunityPackages Handler', () => {
expect(mockResponse.json).toHaveBeenCalledWith(mapToCommunityPackage(updatedPackage));
});
it('should return 404 when package is not installed', async () => {
it('should throw NotFoundError when package is not installed', async () => {
const req = {
params: { name: 'n8n-nodes-missing' },
body: {},
@ -182,9 +201,18 @@ describe('CommunityPackages Handler', () => {
new NotFoundError(RESPONSE_ERROR_MESSAGES.PACKAGE_NOT_INSTALLED),
);
await handler.updatePackage[handler.updatePackage.length - 1](req, mockResponse);
expect(mockResponse.status).toHaveBeenCalledWith(404);
const handlerFn = handler.updatePackage[handler.updatePackage.length - 1];
let caught: unknown;
try {
await handlerFn(req, mockResponse);
} catch (error) {
caught = error;
}
expect(caught).toBeInstanceOf(NotFoundError);
expect(caught).toMatchObject({
message: RESPONSE_ERROR_MESSAGES.PACKAGE_NOT_INSTALLED,
httpStatusCode: 404,
});
});
});
@ -203,7 +231,7 @@ describe('CommunityPackages Handler', () => {
expect(mockResponse.status).toHaveBeenCalledWith(204);
});
it('should return 404 when package is not installed', async () => {
it('should throw NotFoundError when package is not installed', async () => {
const req = {
params: { name: 'n8n-nodes-missing' },
user: mockUser,
@ -213,9 +241,18 @@ describe('CommunityPackages Handler', () => {
new NotFoundError(RESPONSE_ERROR_MESSAGES.PACKAGE_NOT_INSTALLED),
);
await handler.uninstallPackage[handler.uninstallPackage.length - 1](req, mockResponse);
expect(mockResponse.status).toHaveBeenCalledWith(404);
const handlerFn = handler.uninstallPackage[handler.uninstallPackage.length - 1];
let caught: unknown;
try {
await handlerFn(req, mockResponse);
} catch (error) {
caught = error;
}
expect(caught).toBeInstanceOf(NotFoundError);
expect(caught).toMatchObject({
message: RESPONSE_ERROR_MESSAGES.PACKAGE_NOT_INSTALLED,
httpStatusCode: 404,
});
});
});
});
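The updated tests above repeat the same try/catch capture shape to assert on a thrown handler error. That shape can be factored into a small helper; this is a sketch of the pattern, not code from the diff:

```typescript
// Hypothetical test helper: run an async fn and return whatever it threw
// (or undefined if it resolved), so assertions can inspect the error object.
async function captureError(fn: () => Promise<unknown>): Promise<unknown> {
  try {
    await fn();
    return undefined;
  } catch (error) {
    return error;
  }
}
```

Each test would then read `const caught = await captureError(async () => handlerFn(req, mockResponse));` followed by the `toBeInstanceOf`/`toMatchObject` assertions.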

View File

@ -1,52 +1,50 @@
import type { AuthenticatedRequest } from '@n8n/db';
import { Container } from '@n8n/di';
import type express from 'express';
import { ResponseError } from '@/errors/response-errors/abstract/response.error';
import { CommunityPackagesLifecycleService } from '@/modules/community-packages/community-packages.lifecycle.service';
import { mapToCommunityPackage, mapToCommunityPackageList } from './community-packages.mapper';
import type { PublicAPIEndpoint } from '../../shared/handler.types';
import { publicApiScope } from '../../shared/middlewares/global.middleware';
function sendResponseError(res: express.Response, error: ResponseError): express.Response {
return res.status(error.httpStatusCode).json({ message: error.message });
}
type InstallPackageRequest = AuthenticatedRequest<
{},
{},
{ name: string; version?: string; verify?: boolean }
>;
export = {
type UpdatePackageRequest = AuthenticatedRequest<
{ name: string },
{},
{ version?: string; verify?: boolean }
>;
type CommunityPackageHandlers = {
installPackage: PublicAPIEndpoint<InstallPackageRequest>;
getInstalledPackages: PublicAPIEndpoint<AuthenticatedRequest>;
updatePackage: PublicAPIEndpoint<UpdatePackageRequest>;
uninstallPackage: PublicAPIEndpoint<AuthenticatedRequest<{ name: string }>>;
};
const communityPackageHandlers: CommunityPackageHandlers = {
installPackage: [
publicApiScope('communityPackage:install'),
async (
req: AuthenticatedRequest<
Record<string, never>,
unknown,
{ name: string; version?: string; verify?: boolean }
>,
res: express.Response,
): Promise<express.Response> => {
async (req, res) => {
const lifecycle = Container.get(CommunityPackagesLifecycleService);
try {
const installedPackage = await lifecycle.install(
{ name: req.body.name, version: req.body.version, verify: req.body.verify ?? true },
req.user,
'publicApi',
);
return res.json(mapToCommunityPackage(installedPackage));
} catch (error) {
if (error instanceof ResponseError) {
return sendResponseError(res, error);
}
throw error;
}
const installedPackage = await lifecycle.install(
{ name: req.body.name, version: req.body.version, verify: req.body.verify ?? true },
req.user,
'publicApi',
);
return res.json(mapToCommunityPackage(installedPackage));
},
],
getInstalledPackages: [
publicApiScope('communityPackage:list'),
async (
_req: AuthenticatedRequest<Record<string, never>>,
res: express.Response,
): Promise<express.Response> => {
async (_req, res) => {
const lifecycle = Container.get(CommunityPackagesLifecycleService);
const packages = await lifecycle.listInstalledPackages();
return res.json(mapToCommunityPackageList(packages));
@ -55,53 +53,31 @@ export = {
updatePackage: [
publicApiScope('communityPackage:update'),
async (
req: AuthenticatedRequest<
{ name: string },
Record<string, never>,
{ version?: string; verify?: boolean }
>,
res: express.Response,
): Promise<express.Response> => {
async (req, res) => {
const lifecycle = Container.get(CommunityPackagesLifecycleService);
try {
const updated = await lifecycle.update(
{
name: req.params.name,
version: req.body?.version,
verify: req.body?.verify ?? true,
},
req.user,
'notFound',
);
return res.json(mapToCommunityPackage(updated));
} catch (error) {
if (error instanceof ResponseError) {
return sendResponseError(res, error);
}
throw error;
}
const updated = await lifecycle.update(
{
name: req.params.name,
version: req.body?.version,
verify: req.body?.verify ?? true,
},
req.user,
'notFound',
);
return res.json(mapToCommunityPackage(updated));
},
],
uninstallPackage: [
publicApiScope('communityPackage:uninstall'),
async (
req: AuthenticatedRequest<{ name: string }>,
res: express.Response,
): Promise<express.Response> => {
async (req, res) => {
const lifecycle = Container.get(CommunityPackagesLifecycleService);
try {
await lifecycle.uninstall(req.params.name, req.user, 'notFound');
return res.status(204).send();
} catch (error) {
if (error instanceof ResponseError) {
return sendResponseError(res, error);
}
throw error;
}
await lifecycle.uninstall(req.params.name, req.user, 'notFound');
return res.status(204).send();
},
],
};
export = communityPackageHandlers;
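The handler refactor above drops the per-handler `try { … } catch` plus `sendResponseError` in favor of letting `ResponseError` instances propagate, which implies a single shared layer that maps them to an HTTP status and body. A minimal self-contained sketch of that mapping (the class names mirror, but are not, the real n8n classes):

```typescript
// Hypothetical minimal versions of the error hierarchy (assumption).
class ResponseError extends Error {
  constructor(
    message: string,
    readonly httpStatusCode: number,
  ) {
    super(message);
  }
}
class NotFoundError extends ResponseError {
  constructor(message: string) {
    super(message, 404);
  }
}

// One shared mapper replaces the duplicated catch blocks: known errors
// carry their status; anything else becomes a generic 500.
function toErrorResponse(error: unknown): { status: number; body: { message: string } } {
  if (error instanceof ResponseError) {
    return { status: error.httpStatusCode, body: { message: error.message } };
  }
  return { status: 500, body: { message: 'Internal error' } };
}
```

In an Express app this logic would live in an error-handling middleware registered after the routes, so every handler can simply throw.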

View File

@ -1,11 +1,8 @@
/* eslint-disable @typescript-eslint/no-unsafe-argument */
import { LicenseState } from '@n8n/backend-common';
import type { PublicApiCredentialResponse } from '@n8n/api-types';
import type { CredentialsEntity } from '@n8n/db';
import { CredentialsRepository } from '@n8n/db';
import { Container } from '@n8n/di';
import { hasGlobalScope } from '@n8n/permissions';
import type express from 'express';
import { z } from 'zod';
import { CredentialTypes } from '@/credential-types';
@ -13,8 +10,11 @@ import { CredentialsService } from '@/credentials/credentials.service';
import { EnterpriseCredentialsService } from '@/credentials/credentials.service.ee';
import { CredentialsHelper } from '@/credentials-helper';
import { CredentialNotFoundError } from '@/errors/credential-not-found.error';
import { BadRequestError } from '@/errors/response-errors/bad-request.error';
import { ForbiddenError } from '@/errors/response-errors/forbidden.error';
import { NotFoundError } from '@/errors/response-errors/not-found.error';
import { toPublicApiCredentialResponse } from './credentials.mapper';
import {
validCredentialsProperties,
validCredentialType,
@ -32,8 +32,8 @@ import {
toJsonSchema,
updateCredential,
} from './credentials.service';
import { toPublicApiCredentialResponse } from './credentials.mapper';
import type { CredentialTypeRequest, CredentialRequest } from '../../../types';
import type { PublicAPIEndpoint } from '../../shared/handler.types';
import {
publicApiScope,
apiKeyHasScopeWithGlobalScopeFallback,
@ -41,29 +41,23 @@ import {
validCursor,
} from '../../shared/middlewares/global.middleware';
import { encodeNextCursor } from '../../shared/services/pagination.service';
import { ForbiddenError } from '@/errors/response-errors/forbidden.error';
import { BadRequestError } from '@/errors/response-errors/bad-request.error';
export = {
type CredentialsHandlers = {
getCredentials: PublicAPIEndpoint<CredentialRequest.GetAll>;
getCredential: PublicAPIEndpoint<CredentialRequest.Get>;
testCredential: PublicAPIEndpoint<CredentialRequest.Test>;
createCredential: PublicAPIEndpoint<CredentialRequest.Create>;
updateCredential: PublicAPIEndpoint<CredentialRequest.Update>;
transferCredential: PublicAPIEndpoint<CredentialRequest.Transfer>;
deleteCredential: PublicAPIEndpoint<CredentialRequest.Delete>;
getCredentialType: PublicAPIEndpoint<CredentialTypeRequest.Get>;
};
const credentialsHandlers: CredentialsHandlers = {
getCredentials: [
apiKeyHasScopeWithGlobalScopeFallback({ scope: 'credential:list' }),
validCursor,
async (
req: CredentialRequest.GetAll,
res: express.Response,
): Promise<
express.Response<{
data: Array<{
id: string;
name: string;
type: string;
createdAt: Date;
updatedAt: Date;
shared: ReturnType<typeof buildSharedForCredential>;
}>;
nextCursor: string | null;
}>
> => {
async (req, res) => {
const offset = Number(req.query.offset) || 0;
const limit = Math.min(Number(req.query.limit) || 100, 250);
@ -101,10 +95,7 @@ export = {
getCredential: [
publicApiScope('credential:read'),
projectScope('credential:read', 'credential'),
async (
req: CredentialRequest.Get,
res: express.Response,
): Promise<express.Response<PublicApiCredentialResponse>> => {
async (req, res) => {
const { id: credentialId } = req.params;
const credential = await getCredential(credentialId);
@ -118,12 +109,7 @@ export = {
testCredential: [
publicApiScope('credential:read'),
projectScope('credential:read', 'credential'),
async (
req: CredentialRequest.Test,
res: express.Response<{ status: 'OK' | 'Error'; message: string } | { message: string }>,
): Promise<
express.Response<{ status: 'OK' | 'Error'; message: string } | { message: string }>
> => {
async (req, res) => {
const { id: credentialId } = req.params;
try {
const credentialTestResult = await Container.get(CredentialsService).testById(
@ -144,10 +130,7 @@ export = {
validCredentialType,
validCredentialsProperties,
publicApiScope('credential:create'),
async (
req: CredentialRequest.Create,
res: express.Response,
): Promise<express.Response<PublicApiCredentialResponse>> => {
async (req, res) => {
const savedCredential = await saveCredential(req.body, req.user);
return res.json(savedCredential);
},
@ -157,10 +140,7 @@ export = {
validCredentialsPropertiesForUpdate,
publicApiScope('credential:update'),
projectScope('credential:update', 'credential'),
async (
req: CredentialRequest.Update,
res: express.Response,
): Promise<express.Response<PublicApiCredentialResponse>> => {
async (req, res) => {
const { id: credentialId } = req.params;
const existingCredential = await getCredential(credentialId);
@ -197,7 +177,7 @@ export = {
transferCredential: [
publicApiScope('credential:move'),
projectScope('credential:move', 'credential'),
async (req: CredentialRequest.Transfer, res: express.Response) => {
async (req, res) => {
const body = z.object({ destinationProjectId: z.string() }).parse(req.body);
await Container.get(EnterpriseCredentialsService).transferOne(
@ -206,16 +186,13 @@ export = {
body.destinationProjectId,
);
res.status(204).send();
return res.status(204).send();
},
],
deleteCredential: [
publicApiScope('credential:delete'),
projectScope('credential:delete', 'credential'),
async (
req: CredentialRequest.Delete,
res: express.Response,
): Promise<express.Response<Partial<CredentialsEntity>>> => {
async (req, res) => {
const { id: credentialId } = req.params;
let credential: CredentialsEntity | undefined;
@ -239,7 +216,7 @@ export = {
],
getCredentialType: [
async (req: CredentialTypeRequest.Get, res: express.Response): Promise<express.Response> => {
async (req, res) => {
const { credentialTypeName } = req.params;
try {
@ -256,3 +233,5 @@ export = {
},
],
};
export = credentialsHandlers;

View File

@ -3,6 +3,8 @@ import { ProjectRelationRepository, ProjectRepository } from '@n8n/db';
import { Container } from '@n8n/di';
import type { Response } from 'express';
import { BadRequestError } from '@/errors/response-errors/bad-request.error';
import { NotFoundError } from '@/errors/response-errors/not-found.error';
import { DataTableRepository } from '@/modules/data-table/data-table.repository';
import { DataTableService } from '@/modules/data-table/data-table.service';
import { DataTableNotFoundError } from '@/modules/data-table/errors/data-table-not-found.error';
@ -187,7 +189,7 @@ describe('DataTable Handler', () => {
);
});
it('should return 404 when data table not found', async () => {
it('should throw NotFoundError when data table not found', async () => {
// Arrange
const req = {
params: { dataTableId },
@ -200,16 +202,23 @@ describe('DataTable Handler', () => {
);
// Act
await handler.getDataTableRows[3](req, mockResponse as Response);
const handlerFn = handler.getDataTableRows[3];
let caught: unknown;
try {
await handlerFn(req, mockResponse as Response);
} catch (error) {
caught = error;
}
// Assert
expect(mockResponse.status).toHaveBeenCalledWith(404);
expect(mockResponse.json).toHaveBeenCalledWith({
expect(caught).toBeInstanceOf(NotFoundError);
expect(caught).toMatchObject({
message: expect.stringContaining(dataTableId),
httpStatusCode: 404,
});
});
it('should return 400 for validation errors', async () => {
it('should throw BadRequestError for validation errors', async () => {
// Arrange
const req = {
params: { dataTableId },
@ -218,12 +227,19 @@ describe('DataTable Handler', () => {
} as unknown as DataTableRequest.GetRows;
// Act
await handler.getDataTableRows[3](req, mockResponse as Response);
const handlerFn = handler.getDataTableRows[3];
let caught: unknown;
try {
await handlerFn(req, mockResponse as Response);
} catch (error) {
caught = error;
}
// Assert
expect(mockResponse.status).toHaveBeenCalledWith(400);
expect(mockResponse.json).toHaveBeenCalledWith({
expect(caught).toBeInstanceOf(BadRequestError);
expect(caught).toMatchObject({
message: expect.stringContaining('Invalid'),
httpStatusCode: 400,
});
});
});
@ -302,7 +318,7 @@ describe('DataTable Handler', () => {
expect(mockResponse.json).toHaveBeenCalledWith([mockRow]);
});
it('should return 404 when data table not found', async () => {
it('should throw NotFoundError when data table not found', async () => {
// Arrange
const req = {
params: { dataTableId },
@ -315,10 +331,17 @@ describe('DataTable Handler', () => {
);
// Act
await handler.insertDataTableRows[2](req, mockResponse as Response);
const handlerFn = handler.insertDataTableRows[2];
let caught: unknown;
try {
await handlerFn(req, mockResponse as Response);
} catch (error) {
caught = error;
}
// Assert
expect(mockResponse.status).toHaveBeenCalledWith(404);
expect(caught).toBeInstanceOf(NotFoundError);
expect(caught).toMatchObject({ httpStatusCode: 404 });
});
});
@ -544,7 +567,7 @@ describe('DataTable Handler', () => {
expect(mockResponse.json).toHaveBeenCalledWith([mockRow]);
});
it('should return 400 when filter is missing', async () => {
it('should throw BadRequestError when filter is missing', async () => {
// Arrange
const req = {
params: { dataTableId },
@ -553,12 +576,19 @@ describe('DataTable Handler', () => {
} as unknown as DataTableRequest.DeleteRows;
// Act
await handler.deleteDataTableRows[2](req, mockResponse as Response);
const handlerFn = handler.deleteDataTableRows[2];
let caught: unknown;
try {
await handlerFn(req, mockResponse as Response);
} catch (error) {
caught = error;
}
// Assert
expect(mockResponse.status).toHaveBeenCalledWith(400);
expect(mockResponse.json).toHaveBeenCalledWith({
expect(caught).toBeInstanceOf(BadRequestError);
expect(caught).toMatchObject({
message: 'Required',
httpStatusCode: 400,
});
});
@ -598,7 +628,7 @@ describe('DataTable Handler', () => {
describe('Security - Cross-Project Access', () => {
const otherUserDataTableId = 'other-user-data-table-id';
it('should return 404 when trying to get rows from another users data table', async () => {
it('should throw NotFoundError when trying to get rows from another users data table', async () => {
// Arrange
const req = {
params: { dataTableId: otherUserDataTableId },
@ -617,19 +647,26 @@ describe('DataTable Handler', () => {
);
// Act
await handler.getDataTableRows[3](req, mockResponse as Response);
const handlerFn = handler.getDataTableRows[3];
let caught: unknown;
try {
await handlerFn(req, mockResponse as Response);
} catch (error) {
caught = error;
}
// Assert
expect(mockDataTableService.getProjectIdForDataTable).toHaveBeenCalledWith(
otherUserDataTableId,
);
expect(mockResponse.status).toHaveBeenCalledWith(404);
expect(mockResponse.json).toHaveBeenCalledWith({
expect(caught).toBeInstanceOf(NotFoundError);
expect(caught).toMatchObject({
message: expect.stringContaining(otherUserDataTableId),
httpStatusCode: 404,
});
});
it('should return 404 when trying to insert rows into another users data table', async () => {
it('should throw NotFoundError when trying to insert rows into another users data table', async () => {
// Arrange
const req = {
params: { dataTableId: otherUserDataTableId },
@ -649,16 +686,23 @@ describe('DataTable Handler', () => {
);
// Act
await handler.insertDataTableRows[2](req, mockResponse as Response);
const handlerFn = handler.insertDataTableRows[2];
let caught: unknown;
try {
await handlerFn(req, mockResponse as Response);
} catch (error) {
caught = error;
}
// Assert
expect(mockResponse.status).toHaveBeenCalledWith(404);
expect(mockResponse.json).toHaveBeenCalledWith({
expect(caught).toBeInstanceOf(NotFoundError);
expect(caught).toMatchObject({
message: expect.stringContaining(otherUserDataTableId),
httpStatusCode: 404,
});
});
it('should return 404 when trying to update rows in another users data table', async () => {
it('should throw NotFoundError when trying to update rows in another users data table', async () => {
// Arrange
const req = {
params: { dataTableId: otherUserDataTableId },
@ -680,16 +724,23 @@ describe('DataTable Handler', () => {
);
// Act
await handler.updateDataTableRows[2](req, mockResponse as Response);
const handlerFn = handler.updateDataTableRows[2];
let caught: unknown;
try {
await handlerFn(req, mockResponse as Response);
} catch (error) {
caught = error;
}
// Assert
expect(mockResponse.status).toHaveBeenCalledWith(404);
expect(mockResponse.json).toHaveBeenCalledWith({
expect(caught).toBeInstanceOf(NotFoundError);
expect(caught).toMatchObject({
message: expect.stringContaining(otherUserDataTableId),
httpStatusCode: 404,
});
});
it('should return 404 when trying to upsert row in another users data table', async () => {
it('should throw NotFoundError when trying to upsert row in another users data table', async () => {
// Arrange
const req = {
params: { dataTableId: otherUserDataTableId },
@ -714,16 +765,23 @@ describe('DataTable Handler', () => {
);
// Act
await handler.upsertDataTableRow[2](req, mockResponse as Response);
const handlerFn = handler.upsertDataTableRow[2];
let caught: unknown;
try {
await handlerFn(req, mockResponse as Response);
} catch (error) {
caught = error;
}
// Assert
expect(mockResponse.status).toHaveBeenCalledWith(404);
expect(mockResponse.json).toHaveBeenCalledWith({
expect(caught).toBeInstanceOf(NotFoundError);
expect(caught).toMatchObject({
message: expect.stringContaining(otherUserDataTableId),
httpStatusCode: 404,
});
});
it('should return 404 when trying to delete rows from another users data table', async () => {
it('should throw NotFoundError when trying to delete rows from another users data table', async () => {
// Arrange
const filterStr = JSON.stringify({
type: 'and',
@ -748,12 +806,19 @@ describe('DataTable Handler', () => {
);
// Act
await handler.deleteDataTableRows[2](req, mockResponse as Response);
const handlerFn = handler.deleteDataTableRows[2];
let caught: unknown;
try {
await handlerFn(req, mockResponse as Response);
} catch (error) {
caught = error;
}
// Assert
expect(mockResponse.status).toHaveBeenCalledWith(404);
expect(mockResponse.json).toHaveBeenCalledWith({
expect(caught).toBeInstanceOf(NotFoundError);
expect(caught).toMatchObject({
message: expect.stringContaining(otherUserDataTableId),
httpStatusCode: 404,
});
});
@ -775,16 +840,23 @@ describe('DataTable Handler', () => {
);
// Act
await handler.getDataTableRows[3](req, mockResponse as Response);
const handlerFn = handler.getDataTableRows[3];
let caught: unknown;
try {
await handlerFn(req, mockResponse as Response);
} catch (error) {
caught = error;
}
// Assert
// The error message should be the same whether:
// 1. The table doesn't exist at all
// 2. The table exists but belongs to another user's project
// This prevents information leakage
expect(mockResponse.status).toHaveBeenCalledWith(404);
expect(mockResponse.json).toHaveBeenCalledWith({
expect(caught).toBeInstanceOf(NotFoundError);
expect(caught).toMatchObject({
message: expect.stringContaining(nonExistentDataTableId),
httpStatusCode: 404,
});
});
});

View File

@ -1,6 +1,5 @@
import { AddDataTableColumnDto, updateDataTableColumnSchema } from '@n8n/api-types';
import { Container } from '@n8n/di';
import type express from 'express';
import { BadRequestError } from '@/errors/response-errors/bad-request.error';
import { ConflictError } from '@/errors/response-errors/conflict.error';
@ -9,13 +8,32 @@ import { DataTableColumnNameConflictError } from '@/modules/data-table/errors/da
import { DataTableSystemColumnNameConflictError } from '@/modules/data-table/errors/data-table-system-column-name-conflict.error';
import type { DataTableRequest } from '../../../types';
import type { PublicAPIEndpoint } from '../../shared/handler.types';
import { projectScope, publicApiScope } from '../../shared/middlewares/global.middleware';
export = {
const handleError = (error: unknown) => {
if (
error instanceof DataTableColumnNameConflictError ||
error instanceof DataTableSystemColumnNameConflictError
) {
throw new ConflictError(error.message);
}
throw error;
};
type DataTableColumnsHandlers = {
listDataTableColumns: PublicAPIEndpoint<DataTableRequest.ListColumns>;
createDataTableColumn: PublicAPIEndpoint<DataTableRequest.CreateColumn>;
deleteDataTableColumn: PublicAPIEndpoint<DataTableRequest.DeleteColumn>;
updateDataTableColumn: PublicAPIEndpoint<DataTableRequest.UpdateColumn>;
};
const dataTableColumnsHandlers: DataTableColumnsHandlers = {
listDataTableColumns: [
publicApiScope('dataTableColumn:read'),
projectScope('dataTable:readColumn', 'dataTable'),
async (req: DataTableRequest.ListColumns, res: express.Response) => {
async (req, res) => {
const { dataTableId } = req.params;
const projectId = await Container.get(DataTableService).getProjectIdForDataTable(dataTableId);
return res.json(await Container.get(DataTableService).getColumns(dataTableId, projectId));
@ -25,7 +43,7 @@ export = {
createDataTableColumn: [
publicApiScope('dataTableColumn:create'),
projectScope('dataTable:writeColumn', 'dataTable'),
async (req: DataTableRequest.CreateColumn, res: express.Response) => {
async (req, res) => {
const { dataTableId } = req.params;
const payload = AddDataTableColumnDto.safeParse(req.body);
if (!payload.success) {
@ -41,13 +59,7 @@ export = {
);
return res.status(201).json(column);
} catch (error) {
if (
error instanceof DataTableColumnNameConflictError ||
error instanceof DataTableSystemColumnNameConflictError
) {
throw new ConflictError(error.message);
}
throw error;
return handleError(error);
}
},
],
@ -55,7 +67,7 @@ export = {
deleteDataTableColumn: [
publicApiScope('dataTableColumn:delete'),
projectScope('dataTable:writeColumn', 'dataTable'),
async (req: DataTableRequest.DeleteColumn, res: express.Response) => {
async (req, res) => {
const { dataTableId, columnId } = req.params;
const projectId = await Container.get(DataTableService).getProjectIdForDataTable(dataTableId);
await Container.get(DataTableService).deleteColumn(dataTableId, projectId, columnId);
@ -66,7 +78,7 @@ export = {
updateDataTableColumn: [
publicApiScope('dataTableColumn:update'),
projectScope('dataTable:writeColumn', 'dataTable'),
async (req: DataTableRequest.UpdateColumn, res: express.Response) => {
async (req, res) => {
try {
const { dataTableId, columnId } = req.params;
const payload = updateDataTableColumnSchema.safeParse(req.body);
@ -89,14 +101,10 @@ export = {
const updatedColumn = await service.getColumnById({ projectId, dataTableId, columnId });
return res.json(updatedColumn);
} catch (error) {
if (
error instanceof DataTableColumnNameConflictError ||
error instanceof DataTableSystemColumnNameConflictError
) {
throw new ConflictError(error.message);
}
throw error;
return handleError(error);
}
},
],
};
export = dataTableColumnsHandlers;
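The diff above hoists the duplicated `instanceof` checks from two catch blocks into one module-level `handleError`. A standalone sketch of the same refactor (hypothetical error classes standing in for the data-table conflict errors):

```typescript
// Hypothetical domain errors (stand-ins for the real conflict errors).
class ColumnNameConflictError extends Error {}
class SystemColumnNameConflictError extends Error {}
class ConflictError extends Error {
  readonly httpStatusCode = 409;
}

// Shared mapper: translate known domain errors into an HTTP-level error,
// rethrow everything else untouched. The `never` return type documents
// that this function always throws, so `return handleError(error)` is
// valid in handlers whose signature expects a response.
const handleError = (error: unknown): never => {
  if (
    error instanceof ColumnNameConflictError ||
    error instanceof SystemColumnNameConflictError
  ) {
    throw new ConflictError(error.message);
  }
  throw error;
};
```

Catch blocks shrink to a single `return handleError(error);`, and adding a new domain-to-HTTP mapping touches one place instead of every handler.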

View File

@ -4,10 +4,11 @@ import {
UpdateDataTableDto,
} from '@n8n/api-types';
import { Container } from '@n8n/di';
import type express from 'express';
import { BadRequestError } from '@/errors/response-errors/bad-request.error';
import { ConflictError } from '@/errors/response-errors/conflict.error';
import { ForbiddenError } from '@/errors/response-errors/forbidden.error';
import { NotFoundError } from '@/errors/response-errors/not-found.error';
import { DataTableRepository } from '@/modules/data-table/data-table.repository';
import { DataTableService } from '@/modules/data-table/data-table.service';
import { DataTableNameConflictError } from '@/modules/data-table/errors/data-table-name-conflict.error';
@ -17,6 +18,7 @@ import { ProjectService } from '@/services/project.service.ee';
import { getDataTableListFilter, resolveProjectIdForCreate } from './data-tables.service';
import type { DataTableRequest } from '../../../types';
import type { PublicAPIEndpoint } from '../../shared/handler.types';
import {
publicApiScope,
projectScope,
@ -24,22 +26,20 @@ import {
} from '../../shared/middlewares/global.middleware';
import { encodeNextCursor } from '../../shared/services/pagination.service';
const handleError = (error: unknown, res: express.Response): express.Response => {
if (error instanceof DataTableNotFoundError) {
return res.status(404).json({ message: error.message });
}
if (error instanceof DataTableNameConflictError) {
return res.status(409).json({ message: error.message });
}
const handleError = (error: unknown) => {
if (error instanceof DataTableValidationError) {
return res.status(400).json({ message: error.message });
throw new BadRequestError(error.message);
}
if (error instanceof DataTableNotFoundError) {
throw new NotFoundError(error.message);
}
if (error instanceof ForbiddenError) {
return res.status(error.httpStatusCode).json({ message: error.message });
throw new ForbiddenError(error.message);
}
if (error instanceof Error) {
return res.status(400).json({ message: error.message });
if (error instanceof DataTableNameConflictError) {
throw new ConflictError(error.message);
}
throw error;
};
@ -57,17 +57,23 @@ const stringifyQuery = (query: Record<string, unknown>): Record<string, string |
return result;
};
export = {
type DataTableHandlers = {
listDataTables: PublicAPIEndpoint<DataTableRequest.List>;
createDataTable: PublicAPIEndpoint<DataTableRequest.Create>;
getDataTable: PublicAPIEndpoint<DataTableRequest.Get>;
updateDataTable: PublicAPIEndpoint<DataTableRequest.Update>;
deleteDataTable: PublicAPIEndpoint<DataTableRequest.Delete>;
};
const dataTableHandlers: DataTableHandlers = {
listDataTables: [
publicApiScope('dataTable:list'),
validCursor,
async (req: DataTableRequest.List, res: express.Response): Promise<express.Response> => {
async (req, res) => {
try {
const payload = PublicApiListDataTableQueryDto.safeParse(stringifyQuery(req.query));
if (!payload.success) {
return res.status(400).json({
message: payload.error.errors[0]?.message || 'Invalid query parameters',
});
throw new BadRequestError(payload.error.errors[0]?.message || 'Invalid query parameters');
}
const { offset, limit, filter, sortBy } = payload.data;
@ -111,14 +117,14 @@ export = {
}),
});
} catch (error) {
return handleError(error, res);
return handleError(error);
}
},
],
createDataTable: [
publicApiScope('dataTable:create'),
async (req: DataTableRequest.Create, res: express.Response): Promise<express.Response> => {
async (req, res) => {
const payload = PublicApiCreateDataTableDto.safeParse(req.body);
if (!payload.success) {
throw new BadRequestError(payload.error.errors[0]?.message || 'Invalid request body');
@ -135,7 +141,7 @@ export = {
return res.status(201).json(dataTable);
} catch (error) {
return handleError(error, res);
return handleError(error);
}
},
],
@ -143,7 +149,7 @@ export = {
getDataTable: [
publicApiScope('dataTable:read'),
projectScope('dataTable:read', 'dataTable'),
async (req: DataTableRequest.Get, res: express.Response): Promise<express.Response> => {
async (req, res) => {
try {
const { dataTableId } = req.params;
@ -163,7 +169,7 @@ export = {
return res.json(dataTable);
} catch (error) {
return handleError(error, res);
return handleError(error);
}
},
],
@ -171,15 +177,13 @@ export = {
updateDataTable: [
publicApiScope('dataTable:update'),
projectScope('dataTable:update', 'dataTable'),
async (req: DataTableRequest.Update, res: express.Response): Promise<express.Response> => {
async (req, res) => {
try {
const { dataTableId } = req.params;
const payload = UpdateDataTableDto.safeParse(req.body);
if (!payload.success) {
return res.status(400).json({
message: payload.error.errors[0]?.message || 'Invalid request body',
});
throw new BadRequestError(payload.error.errors[0]?.message || 'Invalid request body');
}
const projectId =
@ -200,7 +204,7 @@ export = {
return res.json(dataTable);
} catch (error) {
return handleError(error, res);
return handleError(error);
}
},
],
@ -208,7 +212,7 @@ export = {
deleteDataTable: [
publicApiScope('dataTable:delete'),
projectScope('dataTable:delete', 'dataTable'),
async (req: DataTableRequest.Delete, res: express.Response): Promise<express.Response> => {
async (req, res) => {
try {
const { dataTableId } = req.params;
@ -219,8 +223,10 @@ export = {
return res.status(204).send();
} catch (error) {
return handleError(error, res);
return handleError(error);
}
},
],
};
export = dataTableHandlers;
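The recurring change in these handlers is that inline `res.status(...).json(...)` branches are replaced by a single `handleError` that rethrows domain errors as typed HTTP errors. A minimal self-contained sketch of that pattern (the classes below are simplified stand-ins mirroring the n8n names, not the real implementations):

```typescript
// Simplified stand-ins for n8n's response-error hierarchy (illustrative only).
class ResponseError extends Error {
	constructor(
		message: string,
		readonly httpStatusCode: number,
	) {
		super(message);
	}
}
class BadRequestError extends ResponseError {
	constructor(message: string) {
		super(message, 400);
	}
}
class NotFoundError extends ResponseError {
	constructor(message: string) {
		super(message, 404);
	}
}

// Domain-level errors raised by the service layer.
class DataTableNotFoundError extends Error {}
class DataTableValidationError extends Error {}

// One mapping from domain errors to HTTP errors, instead of repeating
// `res.status(...).json(...)` branches in every handler.
const handleError = (error: unknown): never => {
	if (error instanceof DataTableValidationError) {
		throw new BadRequestError(error.message);
	}
	if (error instanceof DataTableNotFoundError) {
		throw new NotFoundError(error.message);
	}
	throw error; // unknown errors propagate to the global error middleware
};
```

A shared error middleware can then serialize any thrown `ResponseError` into `res.status(error.httpStatusCode).json({ message: error.message })` in exactly one place.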


@ -6,13 +6,15 @@ import {
DeleteDataTableRowsDto,
} from '@n8n/api-types';
import { Container } from '@n8n/di';
import type express from 'express';
import { BadRequestError } from '@/errors/response-errors/bad-request.error';
import { NotFoundError } from '@/errors/response-errors/not-found.error';
import { DataTableService } from '@/modules/data-table/data-table.service';
import { DataTableNotFoundError } from '@/modules/data-table/errors/data-table-not-found.error';
import { DataTableValidationError } from '@/modules/data-table/errors/data-table-validation.error';
import type { DataTableRequest } from '../../../types';
import type { PublicAPIEndpoint } from '../../shared/handler.types';
import {
publicApiScope,
projectScope,
@ -20,16 +22,14 @@ import {
} from '../../shared/middlewares/global.middleware';
import { encodeNextCursor } from '../../shared/services/pagination.service';
const handleError = (error: unknown, res: express.Response): express.Response => {
const handleError = (error: unknown) => {
if (error instanceof DataTableNotFoundError) {
return res.status(404).json({ message: error.message });
throw new NotFoundError(error.message);
}
if (error instanceof DataTableValidationError) {
return res.status(400).json({ message: error.message });
}
if (error instanceof Error) {
return res.status(400).json({ message: error.message });
throw new BadRequestError(error.message);
}
throw error;
};
@ -47,20 +47,26 @@ const stringifyQuery = (query: Record<string, unknown>): Record<string, string |
return result;
};
export = {
type DataTableRowsHandlers = {
getDataTableRows: PublicAPIEndpoint<DataTableRequest.GetRows>;
insertDataTableRows: PublicAPIEndpoint<DataTableRequest.InsertRows>;
updateDataTableRows: PublicAPIEndpoint<DataTableRequest.UpdateRows>;
upsertDataTableRow: PublicAPIEndpoint<DataTableRequest.UpsertRow>;
deleteDataTableRows: PublicAPIEndpoint<DataTableRequest.DeleteRows>;
};
const dataTableRowsHandlers: DataTableRowsHandlers = {
getDataTableRows: [
publicApiScope('dataTableRow:read'),
projectScope('dataTable:readRow', 'dataTable'),
validCursor,
async (req: DataTableRequest.GetRows, res: express.Response): Promise<express.Response> => {
async (req, res) => {
try {
const { dataTableId } = req.params;
const payload = PublicApiListDataTableContentQueryDto.safeParse(stringifyQuery(req.query));
if (!payload.success) {
return res.status(400).json({
message: payload.error.errors[0]?.message || 'Invalid query parameters',
});
throw new BadRequestError(payload.error.errors[0]?.message || 'Invalid query parameters');
}
const { offset, limit, filter, sortBy, search } = payload.data;
@ -89,7 +95,7 @@ export = {
}),
});
} catch (error) {
return handleError(error, res);
return handleError(error);
}
},
],
@ -97,15 +103,13 @@ export = {
insertDataTableRows: [
publicApiScope('dataTableRow:create'),
projectScope('dataTable:writeRow', 'dataTable'),
async (req: DataTableRequest.InsertRows, res: express.Response): Promise<express.Response> => {
async (req, res) => {
try {
const { dataTableId } = req.params;
const payload = AddDataTableRowsDto.safeParse(req.body);
if (!payload.success) {
return res.status(400).json({
message: payload.error.errors[0]?.message || 'Invalid request body',
});
throw new BadRequestError(payload.error.errors[0]?.message || 'Invalid request body');
}
const projectId =
@ -120,7 +124,7 @@ export = {
return res.json(result);
} catch (error) {
return handleError(error, res);
return handleError(error);
}
},
],
@ -128,15 +132,13 @@ export = {
updateDataTableRows: [
publicApiScope('dataTableRow:update'),
projectScope('dataTable:writeRow', 'dataTable'),
async (req: DataTableRequest.UpdateRows, res: express.Response): Promise<express.Response> => {
async (req, res) => {
try {
const { dataTableId } = req.params;
const payload = UpdateDataTableRowDto.safeParse(req.body);
if (!payload.success) {
return res.status(400).json({
message: payload.error.errors[0]?.message || 'Invalid request body',
});
throw new BadRequestError(payload.error.errors[0]?.message || 'Invalid request body');
}
const projectId =
@ -153,7 +155,7 @@ export = {
return res.json(result);
} catch (error) {
return handleError(error, res);
return handleError(error);
}
},
],
@ -161,15 +163,13 @@ export = {
upsertDataTableRow: [
publicApiScope('dataTableRow:upsert'),
projectScope('dataTable:writeRow', 'dataTable'),
async (req: DataTableRequest.UpsertRow, res: express.Response): Promise<express.Response> => {
async (req, res) => {
try {
const { dataTableId } = req.params;
const payload = UpsertDataTableRowDto.safeParse(req.body);
if (!payload.success) {
return res.status(400).json({
message: payload.error.errors[0]?.message || 'Invalid request body',
});
throw new BadRequestError(payload.error.errors[0]?.message || 'Invalid request body');
}
const projectId =
@ -186,7 +186,7 @@ export = {
return res.json(result);
} catch (error) {
return handleError(error, res);
return handleError(error);
}
},
],
@ -194,15 +194,13 @@ export = {
deleteDataTableRows: [
publicApiScope('dataTableRow:delete'),
projectScope('dataTable:writeRow', 'dataTable'),
async (req: DataTableRequest.DeleteRows, res: express.Response): Promise<express.Response> => {
async (req, res) => {
try {
const { dataTableId } = req.params;
const payload = DeleteDataTableRowsDto.safeParse(stringifyQuery(req.query));
if (!payload.success) {
return res.status(400).json({
message: payload.error.errors[0]?.message || 'Invalid query parameters',
});
throw new BadRequestError(payload.error.errors[0]?.message || 'Invalid query parameters');
}
const projectId =
@ -219,8 +217,10 @@ export = {
return res.json(result);
} catch (error) {
return handleError(error, res);
return handleError(error);
}
},
],
};
export = dataTableRowsHandlers;


@ -1,8 +1,10 @@
import { mockInstance } from '@n8n/backend-test-utils';
import type { AuthenticatedRequest } from '@n8n/db';
import { ApiKeyRepository } from '@n8n/db';
import { Container } from '@n8n/di';
import type { Response } from 'express';
import type { AuthenticatedRequest } from '@n8n/db';
import { UnauthenticatedError } from '@/errors/response-errors/unauthenticated.error';
import * as discoverService from '../discover.service';
@ -24,25 +26,28 @@ describe('Discover Handler', () => {
mockResponse = {
json: jest.fn().mockReturnThis(),
status: jest.fn().mockReturnThis(),
};
});
it('should return 401 when API key header is missing', async () => {
it('should throw UnauthenticatedError when API key header is missing', async () => {
const req = {
headers: {},
query: {},
} as unknown as AuthenticatedRequest;
const handlerFn = handler.getDiscover[0];
await handlerFn(req, mockResponse);
expect(mockResponse.status).toHaveBeenCalledWith(401);
expect(mockResponse.json).toHaveBeenCalledWith({ message: 'Unauthorized' });
let caught: unknown;
try {
await handlerFn(req, mockResponse as Response);
} catch (error) {
caught = error;
}
expect(caught).toBeInstanceOf(UnauthenticatedError);
expect(caught).toMatchObject({ message: 'Unauthorized', httpStatusCode: 401 });
expect(mockApiKeyRepository.findOne).not.toHaveBeenCalled();
});
it('should return 401 when API key not found in DB', async () => {
it('should throw UnauthenticatedError when API key not found in DB', async () => {
mockApiKeyRepository.findOne.mockResolvedValue(null);
const req = {
@ -51,10 +56,14 @@ describe('Discover Handler', () => {
} as unknown as AuthenticatedRequest;
const handlerFn = handler.getDiscover[0];
await handlerFn(req, mockResponse);
expect(mockResponse.status).toHaveBeenCalledWith(401);
expect(mockResponse.json).toHaveBeenCalledWith({ message: 'Unauthorized' });
let caught: unknown;
try {
await handlerFn(req, mockResponse as Response);
} catch (error) {
caught = error;
}
expect(caught).toBeInstanceOf(UnauthenticatedError);
expect(caught).toMatchObject({ message: 'Unauthorized', httpStatusCode: 401 });
});
it('should return discover data when API key is valid', async () => {


@ -1,32 +1,36 @@
import { ApiKeyRepository } from '@n8n/db';
import type { AuthenticatedRequest } from '@n8n/db';
import { ApiKeyRepository, type AuthenticatedRequest } from '@n8n/db';
import { Container } from '@n8n/di';
import type express from 'express';
import { UnauthenticatedError } from '@/errors/response-errors/unauthenticated.error';
import { buildDiscoverResponse } from './discover.service';
import type { PublicAPIEndpoint } from '../../shared/handler.types';
const API_KEY_AUDIENCE = 'public-api';
type GetDiscoverRequest = AuthenticatedRequest<
{},
{},
{},
{ include?: string; resource?: string; operation?: string }
>;
function firstString(value: unknown): string | undefined {
if (typeof value === 'string') return value;
if (Array.isArray(value) && typeof value[0] === 'string') return value[0];
return undefined;
}
export = {
type DiscoverHandlers = {
getDiscover: PublicAPIEndpoint<GetDiscoverRequest>;
};
const discoverHandlers: DiscoverHandlers = {
getDiscover: [
async (
req: AuthenticatedRequest<
{},
{},
{},
{ include?: string; resource?: string; operation?: string }
>,
res: express.Response,
): Promise<express.Response> => {
async (req, res) => {
const apiKey = firstString(req.headers['x-n8n-api-key']);
if (!apiKey) {
return res.status(401).json({ message: 'Unauthorized' });
throw new UnauthenticatedError('Unauthorized');
}
const apiKeyRecord = await Container.get(ApiKeyRepository).findOne({
@ -35,7 +39,7 @@ export = {
});
if (!apiKeyRecord) {
return res.status(401).json({ message: 'Unauthorized' });
throw new UnauthenticatedError('Unauthorized');
}
const includeSchemas = req.query.include === 'schemas';
@ -48,3 +52,5 @@ export = {
},
],
};
export = discoverHandlers;
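The `firstString` helper above exists because Node/Express header values can be `string | string[] | undefined`, so reading `req.headers['x-n8n-api-key']` directly could hand an array to the API-key lookup. Reproduced standalone for illustration:

```typescript
// Normalizes a header value: returns the string itself, the first string
// element of an array, or undefined for anything else.
function firstString(value: unknown): string | undefined {
	if (typeof value === 'string') return value;
	if (Array.isArray(value) && typeof value[0] === 'string') return value[0];
	return undefined;
}
```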


@ -1,10 +1,9 @@
import { ExecutionRedactionQueryDtoSchema } from '@n8n/api-types';
import type { IExecutionBase } from '@n8n/db';
import { ExecutionRepository } from '@n8n/db';
import { Container } from '@n8n/di';
import type express from 'express';
// eslint-disable-next-line n8n-local-rules/misplaced-n8n-typeorm-import
import { QueryFailedError } from '@n8n/typeorm';
import { ExecutionRedactionQueryDtoSchema } from '@n8n/api-types';
import { type ExecutionStatus, replaceCircularReferences } from 'n8n-workflow';
import { ActiveExecutions } from '@/active-executions';
@ -12,36 +11,60 @@ import { ConcurrencyControlService } from '@/concurrency/concurrency-control.ser
import { AbortedExecutionRetryError } from '@/errors/aborted-execution-retry.error';
import { MissingExecutionStopError } from '@/errors/missing-execution-stop.error';
import { QueuedExecutionRetryError } from '@/errors/queued-execution-retry.error';
import { BadRequestError } from '@/errors/response-errors/bad-request.error';
import { ConflictError } from '@/errors/response-errors/conflict.error';
import { NotFoundError } from '@/errors/response-errors/not-found.error';
import { ResponseError } from '@/errors/response-errors/abstract/response.error';
import { EventService } from '@/events/event.service';
import { ExecutionPersistence } from '@/executions/execution-persistence';
import type { RedactableExecution } from '@/executions/execution-redaction';
import { ExecutionRedactionServiceProxy } from '@/executions/execution-redaction-proxy.service';
import { ExecutionService } from '@/executions/execution.service';
import { getExecutionTags, mapAnnotationTags, updateExecutionTags } from './executions.service';
import type { ExecutionRequest } from '../../../types';
import type { PublicAPIEndpoint } from '../../shared/handler.types';
import { publicApiScope, validCursor } from '../../shared/middlewares/global.middleware';
import { encodeNextCursor } from '../../shared/services/pagination.service';
import { getSharedWorkflowIds } from '../workflows/workflows.service';
const handleError = (error: unknown) => {
if (error instanceof QueuedExecutionRetryError || error instanceof AbortedExecutionRetryError) {
throw new ConflictError(error.message);
}
if (error instanceof MissingExecutionStopError) {
throw new NotFoundError(error.message);
}
throw error;
};
function isRedactableExecution(
execution: IExecutionBase,
): execution is IExecutionBase & RedactableExecution {
return 'data' in execution && 'workflowData' in execution;
}
import type { ExecutionRequest } from '../../../types';
import { publicApiScope, validCursor } from '../../shared/middlewares/global.middleware';
import { encodeNextCursor } from '../../shared/services/pagination.service';
import { getSharedWorkflowIds } from '../workflows/workflows.service';
import { getExecutionTags, mapAnnotationTags, updateExecutionTags } from './executions.service';
type ExecutionHandlers = {
deleteExecution: PublicAPIEndpoint<ExecutionRequest.Delete>;
getExecution: PublicAPIEndpoint<ExecutionRequest.Get>;
getExecutions: PublicAPIEndpoint<ExecutionRequest.GetAll>;
retryExecution: PublicAPIEndpoint<ExecutionRequest.Retry>;
getExecutionTags: PublicAPIEndpoint<ExecutionRequest.GetTags>;
updateExecutionTags: PublicAPIEndpoint<ExecutionRequest.UpdateTags>;
stopExecution: PublicAPIEndpoint<ExecutionRequest.Stop>;
stopManyExecutions: PublicAPIEndpoint<ExecutionRequest.StopMany>;
};
export = {
const executionHandlers: ExecutionHandlers = {
deleteExecution: [
publicApiScope('execution:delete'),
async (req: ExecutionRequest.Delete, res: express.Response): Promise<express.Response> => {
async (req, res) => {
const sharedWorkflowsIds = await getSharedWorkflowIds(req.user, ['workflow:delete']);
// user does not have workflows hence no executions
// or the execution they are trying to access belongs to a workflow they do not own
if (!sharedWorkflowsIds.length) {
return res.status(404).json({ message: 'Not Found' });
throw new NotFoundError('Not Found');
}
const { id } = req.params;
@ -52,13 +75,11 @@ export = {
).getExecutionInWorkflowsForPublicApi(id, sharedWorkflowsIds, false);
if (!execution) {
return res.status(404).json({ message: 'Not Found' });
throw new NotFoundError('Not Found');
}
if (execution.status === 'running') {
return res.status(400).json({
message: 'Cannot delete a running execution',
});
throw new BadRequestError('Cannot delete a running execution');
}
if (execution.status === 'new') {
@ -81,13 +102,13 @@ export = {
],
getExecution: [
publicApiScope('execution:read'),
async (req: ExecutionRequest.Get, res: express.Response): Promise<express.Response> => {
async (req, res) => {
const sharedWorkflowsIds = await getSharedWorkflowIds(req.user, ['workflow:read']);
// user does not have workflows hence no executions
// or the execution they are trying to access belongs to a workflow they do not own
if (!sharedWorkflowsIds.length) {
return res.status(404).json({ message: 'Not Found' });
throw new NotFoundError('Not Found');
}
const { id } = req.params;
@ -99,7 +120,7 @@ export = {
).getExecutionInWorkflowsForPublicApi(id, sharedWorkflowsIds, includeData);
if (!execution) {
return res.status(404).json({ message: 'Not Found' });
throw new NotFoundError('Not Found');
}
if (includeData && isRedactableExecution(execution)) {
@ -108,24 +129,12 @@ export = {
? redactQuery.data.redactExecutionData
: undefined;
try {
await Container.get(ExecutionRedactionServiceProxy).processExecution(execution, {
user: req.user,
redactExecutionData,
ipAddress: req.ip ?? '',
userAgent: req.headers['user-agent'] ?? '',
});
} catch (error) {
if (error instanceof ResponseError) {
return res.status(error.httpStatusCode).json({
code: error.httpStatusCode,
message: error.message,
hint: error.hint,
meta: 'meta' in error ? error.meta : undefined,
});
}
throw error;
}
await Container.get(ExecutionRedactionServiceProxy).processExecution(execution, {
user: req.user,
redactExecutionData,
ipAddress: req.ip ?? '',
userAgent: req.headers['user-agent'] ?? '',
});
}
Container.get(EventService).emit('user-retrieved-execution', {
@ -139,7 +148,7 @@ export = {
getExecutions: [
publicApiScope('execution:list'),
validCursor,
async (req: ExecutionRequest.GetAll, res: express.Response): Promise<express.Response> => {
async (req, res) => {
const {
lastId = undefined,
limit = 100,
@ -192,27 +201,15 @@ export = {
: undefined;
const redactableExecutions = executions.filter(isRedactableExecution);
try {
await Container.get(ExecutionRedactionServiceProxy).processExecutions(
redactableExecutions,
{
user: req.user,
redactExecutionData,
ipAddress: req.ip ?? '',
userAgent: req.headers['user-agent'] ?? '',
},
);
} catch (error) {
if (error instanceof ResponseError) {
return res.status(error.httpStatusCode).json({
code: error.httpStatusCode,
message: error.message,
hint: error.hint,
meta: 'meta' in error ? error.meta : undefined,
});
}
throw error;
}
await Container.get(ExecutionRedactionServiceProxy).processExecutions(
redactableExecutions,
{
user: req.user,
redactExecutionData,
ipAddress: req.ip ?? '',
userAgent: req.headers['user-agent'] ?? '',
},
);
}
Container.get(EventService).emit('user-retrieved-all-executions', {
@ -232,13 +229,13 @@ export = {
],
retryExecution: [
publicApiScope('execution:retry'),
async (req: ExecutionRequest.Retry, res: express.Response): Promise<express.Response> => {
async (req, res) => {
const sharedWorkflowsIds = await getSharedWorkflowIds(req.user, ['workflow:read']);
// user does not have workflows hence no executions
// or the execution they are trying to access belongs to a workflow they do not own
if (!sharedWorkflowsIds.length) {
return res.status(404).json({ message: 'Not Found' });
throw new NotFoundError('Not Found');
}
try {
@ -254,27 +251,18 @@ export = {
return res.json(replaceCircularReferences(retriedExecution));
} catch (error) {
if (
error instanceof QueuedExecutionRetryError ||
error instanceof AbortedExecutionRetryError
) {
return res.status(409).json({ message: error.message });
} else if (error instanceof NotFoundError) {
return res.status(404).json({ message: error.message });
} else {
throw error;
}
return handleError(error);
}
},
],
getExecutionTags: [
publicApiScope('executionTags:list'),
async (req: ExecutionRequest.GetTags, res: express.Response): Promise<express.Response> => {
async (req, res) => {
const { id } = req.params;
const sharedWorkflowsIds = await getSharedWorkflowIds(req.user, ['workflow:read']);
if (!sharedWorkflowsIds.length) {
return res.status(404).json({ message: 'Not Found' });
throw new NotFoundError('Not Found');
}
const execution = await Container.get(
@ -282,7 +270,7 @@ export = {
).getExecutionInWorkflowsForPublicApi(id, sharedWorkflowsIds, false);
if (!execution) {
return res.status(404).json({ message: 'Not Found' });
throw new NotFoundError('Not Found');
}
const tags = await getExecutionTags(id);
@ -292,13 +280,13 @@ export = {
],
updateExecutionTags: [
publicApiScope('executionTags:update'),
async (req: ExecutionRequest.UpdateTags, res: express.Response): Promise<express.Response> => {
async (req, res) => {
const { id } = req.params;
const newTagIds = req.body.map((tag) => tag.id);
const sharedWorkflowsIds = await getSharedWorkflowIds(req.user, ['workflow:update']);
if (!sharedWorkflowsIds.length) {
return res.status(404).json({ message: 'Not Found' });
throw new NotFoundError('Not Found');
}
const execution = await Container.get(
@ -306,7 +294,7 @@ export = {
).getExecutionInWorkflowsForPublicApi(id, sharedWorkflowsIds, false);
if (!execution) {
return res.status(404).json({ message: 'Not Found' });
throw new NotFoundError('Not Found');
}
try {
@ -315,21 +303,21 @@ export = {
return res.json(tags);
} catch (error) {
if (error instanceof QueryFailedError) {
return res.status(404).json({ message: 'Some tags not found' });
throw new NotFoundError('Some tags not found');
}
throw error;
return handleError(error);
}
},
],
stopExecution: [
publicApiScope('execution:stop'),
async (req: ExecutionRequest.Stop, res: express.Response): Promise<express.Response> => {
async (req, res) => {
const sharedWorkflowsIds = await getSharedWorkflowIds(req.user, ['workflow:execute']);
// user does not have workflows hence no executions
// or the execution they are trying to access belongs to a workflow they do not own
if (!sharedWorkflowsIds.length) {
return res.status(404).json({ message: 'Not Found' });
throw new NotFoundError('Not Found');
}
const { id } = req.params;
@ -339,19 +327,13 @@ export = {
return res.json(replaceCircularReferences(stopResult));
} catch (error) {
if (error instanceof MissingExecutionStopError) {
return res.status(404).json({ message: 'Not Found' });
} else if (error instanceof NotFoundError) {
return res.status(404).json({ message: error.message });
} else {
throw error;
}
return handleError(error);
}
},
],
stopManyExecutions: [
publicApiScope('execution:stop'),
async (req: ExecutionRequest.StopMany, res: express.Response): Promise<express.Response> => {
async (req, res) => {
const { status: rawStatus, workflowId, startedAfter, startedBefore } = req.body;
const status: ExecutionStatus[] = rawStatus.map((x) => (x === 'queued' ? 'new' : x));
// Validate that status is provided and not empty
@ -374,7 +356,7 @@ export = {
// If workflowId is provided, validate user has access to it
if (workflowId && workflowId !== 'all' && !sharedWorkflowsIds.includes(workflowId)) {
return res.status(404).json({ message: 'Workflow not found or not accessible' });
throw new NotFoundError('Workflow not found or not accessible');
}
const filter = {
@ -390,3 +372,5 @@ export = {
},
],
};
export = executionHandlers;
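The other half of this refactor is visible in the signatures: handler tuples are now declared against a typed map (`ExecutionHandlers`, `DataTableHandlers`, and so on), so each `async (req, res)` receives its parameter types by contextual inference instead of inline annotations. A reduced sketch of the idea, using toy stand-ins (assumed shapes, not the real `PublicAPIEndpoint` or Express types):

```typescript
// Toy stand-ins for the Express response and the endpoint tuple type.
type Res = { json: (body: unknown) => unknown };
type Handler<Req> = (req: Req, res: Res) => Promise<unknown>;
// Stand-in for PublicAPIEndpoint: a handler tuple keyed by request type.
type Endpoint<Req> = ReadonlyArray<Handler<Req>>;

type GetExecutionRequest = { params: { id: string }; user: { id: string } };

const handlers: { getExecution: Endpoint<GetExecutionRequest> } = {
	getExecution: [
		// `req` is inferred as GetExecutionRequest; no inline annotation needed.
		async (req, res) => res.json({ id: req.params.id, userId: req.user.id }),
	],
};
```

Because the annotation lives once on the map type, adding or reordering middleware in a tuple cannot drift out of sync with per-handler annotations.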


@ -4,6 +4,7 @@ import {
ListFolderQueryDto,
UpdateFolderDto,
} from '@n8n/api-types';
import type { AuthenticatedRequest } from '@n8n/db';
import { Container } from '@n8n/di';
import { UserError } from 'n8n-workflow';
@ -12,25 +13,30 @@ import { BadRequestError } from '@/errors/response-errors/bad-request.error';
import { NotFoundError } from '@/errors/response-errors/not-found.error';
import { FolderService } from '@/services/folder.service';
import type { PublicAPIHandler } from '../../shared/handler.types';
import type { PublicAPIEndpoint } from '../../shared/handler.types';
import {
apiKeyHasScopeWithGlobalScopeFallback,
isLicensed,
} from '../../shared/middlewares/global.middleware';
import { assertProjectScope } from '../../shared/services/utils.service';
type FoldersEndpoint<TParams extends Record<string, string>> = readonly [
ReturnType<typeof isLicensed>,
ReturnType<typeof apiKeyHasScopeWithGlobalScopeFallback>,
PublicAPIHandler<TParams>,
];
const handleError = (error: unknown) => {
if (error instanceof FolderNotFoundError) {
throw new NotFoundError(error.message);
}
if (error instanceof UserError) {
throw new BadRequestError(error.message);
}
throw error;
};
type FolderHandlers = {
createFolder: FoldersEndpoint<{ projectId: string }>;
getFolders: FoldersEndpoint<{ projectId: string }>;
deleteFolder: FoldersEndpoint<{ projectId: string; folderId: string }>;
getFolder: FoldersEndpoint<{ projectId: string; folderId: string }>;
updateFolder: FoldersEndpoint<{ projectId: string; folderId: string }>;
createFolder: PublicAPIEndpoint<AuthenticatedRequest<{ projectId: string }>>;
getFolders: PublicAPIEndpoint<AuthenticatedRequest<{ projectId: string }>>;
deleteFolder: PublicAPIEndpoint<AuthenticatedRequest<{ projectId: string; folderId: string }>>;
getFolder: PublicAPIEndpoint<AuthenticatedRequest<{ projectId: string; folderId: string }>>;
updateFolder: PublicAPIEndpoint<AuthenticatedRequest<{ projectId: string; folderId: string }>>;
};
const folderHandlers: FolderHandlers = {
@ -49,9 +55,8 @@ const folderHandlers: FolderHandlers = {
try {
const folder = await Container.get(FolderService).createFolder(payload.data, projectId);
return res.status(201).json(folder);
} catch (e) {
if (e instanceof FolderNotFoundError) throw new NotFoundError(e.message);
throw e;
} catch (error) {
return handleError(error);
}
},
],
@ -89,10 +94,8 @@ const folderHandlers: FolderHandlers = {
try {
await Container.get(FolderService).deleteFolder(req.user, folderId, projectId, query.data);
return res.status(204).send();
} catch (e) {
if (e instanceof FolderNotFoundError) throw new NotFoundError(e.message);
if (e instanceof UserError) throw new BadRequestError(e.message);
throw e;
} catch (error) {
return handleError(error);
}
},
],
@ -109,9 +112,8 @@ const folderHandlers: FolderHandlers = {
).findFolderWithContentCounts(req.params.folderId, projectId);
return res.json({ ...folder, totalSubFolders, totalWorkflows });
} catch (e) {
if (e instanceof FolderNotFoundError) throw new NotFoundError(e.message);
throw e;
} catch (error) {
return handleError(error);
}
},
],
@ -134,10 +136,8 @@ const folderHandlers: FolderHandlers = {
payload.data,
);
return res.json(folder);
} catch (e) {
if (e instanceof FolderNotFoundError) throw new NotFoundError(e.message);
if (e instanceof UserError) throw new BadRequestError(e.message);
throw e;
} catch (error) {
return handleError(error);
}
},
],


@ -1,6 +1,5 @@
import { InsightsDateFilterDto } from '@n8n/api-types';
import { Container } from '@n8n/di';
import type express from 'express';
import { DateTime } from 'luxon';
import { UserError } from 'n8n-workflow';
import { z } from 'zod';
@ -10,8 +9,17 @@ import { ForbiddenError } from '@/errors/response-errors/forbidden.error';
import { InsightsService } from '@/modules/insights/insights.service';
import type { InsightsRequest } from '@/public-api/types';
import type { PublicAPIEndpoint } from '../../shared/handler.types';
import { publicApiScope } from '../../shared/middlewares/global.middleware';
const handleError = (error: unknown) => {
if (error instanceof UserError) {
throw new ForbiddenError(error.message);
}
throw error;
};
const dateFilterValidationSchema = z
.object({
startDate: z.coerce.date().optional(),
@ -31,10 +39,14 @@ const dateFilterValidationSchema = z
},
);
export = {
type InsightsHandlers = {
getInsightsSummary: PublicAPIEndpoint<InsightsRequest.GetSummary>;
};
const insightsHandlers: InsightsHandlers = {
getInsightsSummary: [
publicApiScope('insights:read'),
async (req: InsightsRequest.GetSummary, res: express.Response): Promise<express.Response> => {
async (req, res) => {
const query = InsightsDateFilterDto.safeParse(req.query);
if (!query.success) {
throw new BadRequestError(query.error.errors.map(({ message }) => message).join('; '));
@ -51,11 +63,7 @@ export = {
try {
Container.get(InsightsService).validateDateFiltersLicense({ startDate, endDate });
} catch (error) {
if (error instanceof UserError) {
throw new ForbiddenError(error.message);
}
throw error;
return handleError(error);
}
const summary = await Container.get(InsightsService).getInsightsSummary({
@ -68,3 +76,5 @@ export = {
},
],
};
export = insightsHandlers;


@@ -9,14 +9,14 @@ import type { AuthenticatedRequest } from '@n8n/db';
 import { ProjectRelationRepository, ProjectRepository } from '@n8n/db';
 import { Container } from '@n8n/di';
 import pick from 'lodash/pick';
-import type { Response } from 'express';
 
 import { ProjectController } from '@/controllers/project.controller';
+import { BadRequestError } from '@/errors/response-errors/bad-request.error';
 import { NotFoundError } from '@/errors/response-errors/not-found.error';
-import { ResponseError } from '@/errors/response-errors/abstract/response.error';
 import type { PaginatedRequest } from '@/public-api/types';
 import { ProjectService } from '@/services/project.service.ee';
+import type { PublicAPIEndpoint } from '../../shared/handler.types';
 
 import {
 	apiKeyHasScopeWithGlobalScopeFallback,
 	isLicensed,
@@ -25,14 +25,31 @@ import {
 import { encodeNextCursor } from '../../shared/services/pagination.service';
 
 type GetAll = PaginatedRequest;
 
-export = {
+type GetProjectUsersRequest = AuthenticatedRequest<{ projectId: string }> & GetAll;
+
+type ProjectHandlers = {
+	createProject: PublicAPIEndpoint<AuthenticatedRequest>;
+	updateProject: PublicAPIEndpoint<AuthenticatedRequest<{ projectId: string }>>;
+	deleteProject: PublicAPIEndpoint<AuthenticatedRequest<{ projectId: string }>>;
+	getProjects: PublicAPIEndpoint<GetAll>;
+	getProjectUsers: PublicAPIEndpoint<GetProjectUsersRequest>;
+	addUsersToProject: PublicAPIEndpoint<AuthenticatedRequest<{ projectId: string }>>;
+	changeUserRoleInProject: PublicAPIEndpoint<
+		AuthenticatedRequest<{ projectId: string; userId: string }>
+	>;
+	deleteUserFromProject: PublicAPIEndpoint<
+		AuthenticatedRequest<{ projectId: string; userId: string }>
+	>;
+};
+
+const projectHandlers: ProjectHandlers = {
 	createProject: [
 		isLicensed('feat:projectRole:admin'),
 		apiKeyHasScopeWithGlobalScopeFallback({ scope: 'project:create' }),
-		async (req: AuthenticatedRequest, res: Response) => {
+		async (req, res) => {
 			const payload = CreateProjectDto.safeParse(req.body);
 			if (payload.error) {
-				return res.status(400).json(payload.error.errors[0]);
+				throw new BadRequestError(payload.error.errors[0].message);
 			}
 
 			const project = await Container.get(ProjectController).createProject(req, res, payload.data);
@@ -43,10 +60,10 @@ export = {
 	updateProject: [
 		isLicensed('feat:projectRole:admin'),
 		apiKeyHasScopeWithGlobalScopeFallback({ scope: 'project:update' }),
-		async (req: AuthenticatedRequest<{ projectId: string }>, res: Response) => {
+		async (req, res) => {
 			const payload = UpdateProjectWithRelationsDto.safeParse(req.body);
 			if (payload.error) {
-				return res.status(400).json(payload.error.errors[0]);
+				throw new BadRequestError(payload.error.errors[0].message);
 			}
 
 			await Container.get(ProjectController).updateProject(
@@ -62,10 +79,10 @@ export = {
 	deleteProject: [
 		isLicensed('feat:projectRole:admin'),
 		apiKeyHasScopeWithGlobalScopeFallback({ scope: 'project:delete' }),
-		async (req: AuthenticatedRequest<{ projectId: string }>, res: Response) => {
+		async (req, res) => {
 			const query = DeleteProjectDto.safeParse(req.query);
 			if (query.error) {
-				return res.status(400).json(query.error.errors[0]);
+				throw new BadRequestError(query.error.errors[0].message);
 			}
 
 			await Container.get(ProjectController).deleteProject(
@@ -82,7 +99,7 @@ export = {
 		isLicensed('feat:projectRole:admin'),
 		apiKeyHasScopeWithGlobalScopeFallback({ scope: 'project:list' }),
 		validCursor,
-		async (req: GetAll, res: Response) => {
+		async (req, res) => {
 			const { offset = 0, limit = 100 } = req.query;
 
 			const [projects, count] = await Container.get(ProjectRepository).findAndCount({
@@ -104,77 +121,63 @@ export = {
 		isLicensed('feat:projectRole:admin'),
 		apiKeyHasScopeWithGlobalScopeFallback({ scope: 'user:list' }),
 		validCursor,
-		async (req: AuthenticatedRequest<{ projectId: string }> & GetAll, res: Response) => {
+		async (req, res) => {
 			const { projectId } = req.params;
 			const offset = Number(req.query.offset) || 0;
 			const limit = Number(req.query.limit) || 100;
 
-			try {
-				const projectService = Container.get(ProjectService);
-				const project = await projectService.getProjectWithScope(req.user, projectId, [
-					'project:list',
-				]);
-				if (!project) {
-					throw new NotFoundError(`Could not find project with ID "${projectId}"`);
-				}
-				const projectRelationRepository = Container.get(ProjectRelationRepository);
-				const [relations, count] = await projectRelationRepository.findAndCount({
-					where: { projectId },
-					relations: { user: true, role: true },
-					skip: offset,
-					take: limit,
-				});
-				const memberFields = [
-					'id',
-					'email',
-					'firstName',
-					'lastName',
-					'createdAt',
-					'updatedAt',
-				] as const;
-				const data = relations.map((relation) => ({
-					...pick(relation.user, memberFields),
-					role: relation.role?.slug ?? null,
-				}));
-				return res.json({
-					data,
-					nextCursor: encodeNextCursor({
-						offset,
-						limit,
-						numberOfTotalRecords: count,
-					}),
-				});
-			} catch (error) {
-				if (error instanceof ResponseError) {
-					return res.status(error.httpStatusCode).json({ message: error.message });
-				}
-				throw error;
-			}
+			const projectService = Container.get(ProjectService);
+			const project = await projectService.getProjectWithScope(req.user, projectId, [
+				'project:list',
+			]);
+			if (!project) {
+				throw new NotFoundError(`Could not find project with ID "${projectId}"`);
+			}
+			const projectRelationRepository = Container.get(ProjectRelationRepository);
+			const [relations, count] = await projectRelationRepository.findAndCount({
+				where: { projectId },
+				relations: { user: true, role: true },
+				skip: offset,
+				take: limit,
+			});
+			const memberFields = [
+				'id',
+				'email',
+				'firstName',
+				'lastName',
+				'createdAt',
+				'updatedAt',
+			] as const;
+			const data = relations.map((relation) => ({
+				...pick(relation.user, memberFields),
+				role: relation.role?.slug ?? null,
+			}));
+			return res.json({
+				data,
+				nextCursor: encodeNextCursor({
+					offset,
+					limit,
+					numberOfTotalRecords: count,
+				}),
+			});
 		},
 	],
 	addUsersToProject: [
 		isLicensed('feat:projectRole:admin'),
 		apiKeyHasScopeWithGlobalScopeFallback({ scope: 'project:update' }),
-		async (req: AuthenticatedRequest<{ projectId: string }>, res: Response) => {
+		async (req, res) => {
 			const payload = AddUsersToProjectDto.safeParse(req.body);
 			if (payload.error) {
-				return res.status(400).json(payload.error.errors[0]);
+				throw new BadRequestError(payload.error.errors[0].message);
 			}
 
-			try {
-				await Container.get(ProjectService).addUsersToProject(
-					req.params.projectId,
-					payload.data.relations,
-				);
-			} catch (error) {
-				if (error instanceof ResponseError) {
-					return res.status(error.httpStatusCode).send({ message: error.message });
-				}
-				throw error;
-			}
+			await Container.get(ProjectService).addUsersToProject(
+				req.params.projectId,
+				payload.data.relations,
+			);
 
 			return res.status(201).send();
 		},
@@ -182,22 +185,15 @@ export = {
 	changeUserRoleInProject: [
 		isLicensed('feat:projectRole:admin'),
 		apiKeyHasScopeWithGlobalScopeFallback({ scope: 'project:update' }),
-		async (req: AuthenticatedRequest<{ projectId: string; userId: string }>, res: Response) => {
+		async (req, res) => {
 			const payload = ChangeUserRoleInProject.safeParse(req.body);
 			if (payload.error) {
-				return res.status(400).json(payload.error.errors[0]);
+				throw new BadRequestError(payload.error.errors[0].message);
 			}
 
 			const { projectId, userId } = req.params;
 			const { role } = payload.data;
 
-			try {
-				await Container.get(ProjectService).changeUserRoleInProject(projectId, userId, role);
-			} catch (error) {
-				if (error instanceof ResponseError) {
-					return res.status(error.httpStatusCode).send({ message: error.message });
-				}
-				throw error;
-			}
+			await Container.get(ProjectService).changeUserRoleInProject(projectId, userId, role);
 
 			return res.status(204).send();
 		},
@@ -205,17 +201,14 @@ export = {
 	deleteUserFromProject: [
 		isLicensed('feat:projectRole:admin'),
 		apiKeyHasScopeWithGlobalScopeFallback({ scope: 'project:update' }),
-		async (req: AuthenticatedRequest<{ projectId: string; userId: string }>, res: Response) => {
+		async (req, res) => {
 			const { projectId, userId } = req.params;
 
-			try {
-				await Container.get(ProjectService).deleteUserFromProject(projectId, userId);
-			} catch (error) {
-				if (error instanceof ResponseError) {
-					return res.status(error.httpStatusCode).send({ message: error.message });
-				}
-				throw error;
-			}
+			await Container.get(ProjectService).deleteUserFromProject(projectId, userId);
 
 			return res.status(204).send();
 		},
 	],
 };
+
+export = projectHandlers;
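The project-handlers diff moves all request typing into a single `ProjectHandlers` map so each `async (req, res)` can drop its inline annotations. A minimal sketch of that pattern; `PublicAPIEndpoint`, `Res`, and `AuthenticatedRequest` here are simplified stand-ins for the real n8n types:

```typescript
// Stand-in types: an endpoint is an ordered array of middlewares/handlers
// that all share one request type.
type Res = { statusCode: number; status(code: number): Res };
type Handler<Req> = (req: Req, res: Res) => Promise<Res | void>;
type PublicAPIEndpoint<Req> = Array<Handler<Req>>;

type AuthenticatedRequest<P = object> = { params: P; user: { id: string } };

type ProjectHandlers = {
	deleteUserFromProject: PublicAPIEndpoint<
		AuthenticatedRequest<{ projectId: string; userId: string }>
	>;
};

const projectHandlers: ProjectHandlers = {
	deleteUserFromProject: [
		// req/res are inferred from ProjectHandlers, so no inline
		// annotations are needed, which is the change the diff makes throughout.
		async (req, res) => {
			const { projectId, userId } = req.params;
			if (!projectId || !userId) throw new Error('missing route params');
			return res.status(204);
		},
	],
};

// Tiny driver that runs an endpoint's middleware/handler chain in order.
async function run<Req>(endpoint: PublicAPIEndpoint<Req>, req: Req): Promise<Res> {
	const res: Res = {
		statusCode: 200,
		status(code: number) {
			this.statusCode = code;
			return this;
		},
	};
	for (const handler of endpoint) await handler(req, res);
	return res;
}

void run(projectHandlers.deleteUserFromProject, {
	params: { projectId: 'p1', userId: 'u1' },
	user: { id: 'owner' },
}).then((res) => console.log(res.statusCode)); // 204
```

Declaring the map type once also makes the final `export = projectHandlers;` line type-checked: a handler with the wrong request shape fails at the map, not at each call site.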

Some files were not shown because too many files have changed in this diff.