mirror of
https://github.com/n8n-io/n8n.git
synced 2026-05-12 16:10:30 +02:00
Merge branch 'master' into aalises-simplify-handoff
Some checks failed
CI: Python / Checks (push) Has been cancelled
Some checks failed
CI: Python / Checks (push) Has been cancelled
# Conflicts: # packages/@n8n/instance-ai/src/tools/orchestration/__tests__/build-workflow-agent.tool.test.ts # packages/@n8n/instance-ai/src/tools/orchestration/build-workflow-agent.tool.ts # packages/cli/src/modules/instance-ai/instance-ai.service.ts
This commit is contained in:
commit
26f4066215
1
.agents/skills
Symbolic link
1
.agents/skills
Symbolic link
|
|
@ -0,0 +1 @@
|
|||
../.claude/plugins/n8n/skills
|
||||
150
.claude/plugins/n8n/skills/community-pr-review/SKILL.md
Normal file
150
.claude/plugins/n8n/skills/community-pr-review/SKILL.md
Normal file
|
|
@ -0,0 +1,150 @@
|
|||
---
|
||||
description: >-
|
||||
Checks if a community pull request is ready for human review. Verifies CLA
|
||||
signature, PR title format, description completeness, test coverage, and
|
||||
cubic-dev-ai issues. Use when given a PR number or branch name to review,
|
||||
or when the user says /community-pr-review, /pr-review, or asks to check if
|
||||
a PR is ready for review.
|
||||
allowed-tools: Bash(gh:*), Bash(git:*), Read, Glob, Grep
|
||||
---
|
||||
|
||||
# Community PR Review
|
||||
|
||||
Given a PR number or branch name, determine whether it is ready for human review.
|
||||
|
||||
## Steps
|
||||
|
||||
### 1. Resolve the PR
|
||||
|
||||
If given a branch name, find the PR number first:
|
||||
```bash
|
||||
gh pr view <branch> --repo n8n-io/n8n --json number --jq .number
|
||||
```
|
||||
|
||||
### 2. Fetch PR data
|
||||
|
||||
```bash
|
||||
gh pr view <number> --repo n8n-io/n8n \
|
||||
--json number,title,body,author,headRefName,headRefOid,files,isDraft,state
|
||||
```
|
||||
|
||||
Fetch in parallel:
|
||||
|
||||
```bash
|
||||
# CLA commit status (primary signal) — statuses are newest-first; use the first returned entry
|
||||
gh api --paginate "repos/n8n-io/n8n/commits/<headRefOid>/statuses" \
|
||||
--jq '[.[] | select(.context == "license/cla") | {state, description}] | first'
|
||||
|
||||
# CLAassistant issue comment (fallback when no commit status) — use the last returned entry
|
||||
gh api --paginate "repos/n8n-io/n8n/issues/<number>/comments" \
|
||||
--jq '[.[] | select(.user.login == "CLAassistant") | .body] | last'
|
||||
|
||||
# cubic-dev-ai PR review comments (streamed so results concatenate cleanly across pages)
|
||||
gh api --paginate "repos/n8n-io/n8n/pulls/<number>/comments" \
|
||||
--jq '.[] | select(.user.login == "cubic-dev-ai[bot]") | {body: .body, path: .path}'
|
||||
```
|
||||
|
||||
### 3. Run the five checks
|
||||
|
||||
#### A. CLA signed
|
||||
|
||||
Check the `license/cla` commit status first; fall back to the CLAassistant comment if no status exists.
|
||||
|
||||
**Commit status** (`context == "license/cla"`):
|
||||
- `state: "success"` → ✅ signed
|
||||
- `state: "failure"` or `state: "error"` → ❌ not signed
|
||||
- `state: "pending"` → ⏳ pending
|
||||
- Not present → fall back to comment
|
||||
|
||||
**CLAassistant issue comment** (fallback):
|
||||
- Body contains `"All committers have signed the CLA."` → ✅ signed
|
||||
- Body contains `"not signed"` or a link to sign → ❌ not signed
|
||||
- No comment → ❌ treat as not signed
|
||||
|
||||
#### B. PR title format
|
||||
|
||||
For all types except `revert`, the title must match:
|
||||
```
|
||||
^(feat|fix|perf|test|docs|refactor|build|ci|chore)(\([a-zA-Z0-9 ]+( Node)?\))?!?: [A-Z].+[^.]$
|
||||
```
|
||||
|
||||
For `revert` titles, the summary is the original commit header (which starts with a lowercase type), so capitalization is not enforced:
|
||||
```
|
||||
^revert(\([a-zA-Z0-9 ]+( Node)?\))?!?: .+[^.]$
|
||||
```
|
||||
|
||||
- Type must be one of: `feat fix perf test docs refactor build ci chore revert`
|
||||
- Scope is optional, in parentheses e.g. `(editor)` or `(Slack Node)`
|
||||
- Breaking changes: `!` before the colon
|
||||
- Summary: starts with capital letter (lowercase allowed for `revert:`), no trailing period
|
||||
- No Linear ticket IDs in the title (e.g. `N8N-1234`)
|
||||
|
||||
#### C. PR description completeness
|
||||
|
||||
1. **Summary** (`## Summary`) — must have non-empty content below the heading (not just the HTML comment).
|
||||
2. **Related tickets** (`## Related Linear tickets, Github issues, and Community forum posts`) — acceptable content: a URL (`http`), a GitHub closing keyword (`closes #N`, `fixes #N`, `resolves #N`, etc.), or empty. Only flag if the section heading is missing entirely.
|
||||
3. **Checklist** (`## Review / Merge checklist`) — all four items must be present. Unchecked checkboxes are expected for community PRs; do **not** flag them as missing.
|
||||
|
||||
#### D. Tests
|
||||
|
||||
Skip this check if the PR type (from the title) is `docs`, `ci`, `chore`, or `build`.
|
||||
|
||||
Otherwise:
|
||||
1. Identify source files changed: non-test files under `packages/` from the `files` list.
|
||||
2. If there are source file changes, check out the PR in a temporary worktree:
|
||||
|
||||
```bash
|
||||
git fetch origin pull/<number>/head:pr/<number>
|
||||
git worktree add /tmp/pr-<number>-review pr/<number>
|
||||
```
|
||||
|
||||
3. Read the changed source files from the worktree to understand whether the changes introduce logic that warrants tests (new functions, bug fixes, behaviour changes, data transformations). Pure config changes, type-only changes, and trivial renames do not require tests.
|
||||
4. Look for matching test files (`*.test.ts`, `*.spec.ts`, files inside `__tests__/`) among the changed files.
|
||||
5. **Always clean up the worktree**, even if a previous check failed:
|
||||
|
||||
```bash
|
||||
git worktree remove /tmp/pr-<number>-review --force
|
||||
git branch -D pr/<number>
|
||||
```
|
||||
|
||||
Report:
|
||||
- ✅ Tests present, or change does not require tests
|
||||
- ❌ Source logic changed but no test files found
|
||||
|
||||
#### E. cubic-dev-ai issues
|
||||
|
||||
Review the PR review comments fetched in step 2. `cubic-dev-ai[bot]` leaves comments for every issue it finds.
|
||||
|
||||
- No comments from `cubic-dev-ai[bot]`, or every comment explicitly states no issues were found → ✅
|
||||
- Any other comment → ❌ report the total count and priority breakdown (e.g. "3 issues: 1× P1, 1× P2, 1× P3")
|
||||
|
||||
### 4. Output
|
||||
|
||||
Always output valid JSON in this exact shape:
|
||||
|
||||
```json
|
||||
{
|
||||
"readyForReview": <true if all passing checks allow merge, false otherwise>,
|
||||
"messageForUser": "<Human-readable summary of what needs to change, written as if posted directly to the PR contributor. 'N/A' if nothing is needed.>",
|
||||
"checks": {
|
||||
"CLA": <true if signed, false if not signed or pending>,
|
||||
"Title": <true if title matches convention, false otherwise>,
|
||||
"Description": <true if all three template sections are complete, false otherwise>,
|
||||
"TestsNeeded": <true if the code changes require tests, false if not applicable>,
|
||||
"TestsIncluded": <true if test files are present in the PR, false otherwise>,
|
||||
"CubicIssues": <true if cubic-dev-ai raised issues, false if no issues>
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
`readyForReview` is `true` only when: `CLA`, `Title`, and `Description` are all `true`; `CubicIssues` is `false`; and either `TestsNeeded` is `false` or `TestsIncluded` is `true`.
|
||||
|
||||
`messageForUser` should be a short, friendly message directed at the contributor listing exactly what they need to address. If `readyForReview` is `true`, set it to `"N/A"`.
|
||||
|
||||
Output nothing other than the JSON block.
|
||||
|
||||
## Notes
|
||||
|
||||
- Draft PRs — report all findings but note the PR is a draft.
|
||||
- If the PR is already merged or closed, say so and skip the checks.
|
||||
- Always remove the worktree even if earlier checks failed.
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
---
|
||||
name: n8n:content-design
|
||||
description: >
|
||||
Product content designer for UI copy. Use when writing, reviewing, or auditing
|
||||
user-facing text: button labels, error messages, tooltips, empty states, modal copy,
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
---
|
||||
name: n8n:conventions
|
||||
description: Quick reference for n8n patterns. Full docs /AGENTS.md
|
||||
---
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
---
|
||||
name: n8n:create-community-node-lint-rule
|
||||
description: >-
|
||||
Create new ESLint rules for the @n8n/eslint-plugin-community-nodes package.
|
||||
Use when adding a lint rule, creating a community node lint, or working on
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
---
|
||||
name: n8n:create-issue
|
||||
description: Create Linear tickets or GitHub issues following n8n conventions. Use when the user asks to create a ticket, file a bug, open an issue, or says /create-issue.
|
||||
argument-hint: "[linear|github] <description of the issue>"
|
||||
compatibility:
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
---
|
||||
name: n8n:create-pr
|
||||
description: Creates GitHub pull requests with properly formatted titles that pass the check-pr-title CI validation. Use when creating PRs, submitting changes for review, or when the user says /pr or asks to create a pull request.
|
||||
allowed-tools: Bash(git:*), Bash(gh:*), Read, Grep, Glob
|
||||
---
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
---
|
||||
name: n8n:create-skill
|
||||
description: >-
|
||||
Guides users through creating effective Agent Skills. Use when you want to
|
||||
create, write, or author a new skill, or ask about skill structure, best
|
||||
|
|
|
|||
|
|
@ -1,62 +0,0 @@
|
|||
# Design System Style Review Rules
|
||||
|
||||
Use these rules when working in `packages/frontend/` and `packages/frontend/@n8n/design-system/`.
|
||||
Always follow guidance in `packages/frontend/@n8n/design-system/src/styleguide/*.mdx`.
|
||||
|
||||
## 1) Token source priority
|
||||
|
||||
Prefer this order when choosing visual values:
|
||||
|
||||
1. Semantic tokens from
|
||||
`packages/frontend/@n8n/design-system/src/css/_tokens.scss`
|
||||
2. Primitives from
|
||||
`packages/frontend/@n8n/design-system/src/css/_primitives.scss`
|
||||
3. Hard-coded values only when no suitable token exists
|
||||
|
||||
If no token exists, request a short rationale in the PR.
|
||||
|
||||
## 2) Hard-coded visual values
|
||||
|
||||
Flag hard-coded visual values and suggest token alternatives. This includes:
|
||||
|
||||
- Colors (`#fff`, `rgb()`, `hsl()`, `oklch()`)
|
||||
- Spacing and sizing (`px`, `rem`, numeric layout constants in styles)
|
||||
- Radius, border widths/styles, and shadows
|
||||
- Typography values (font size, weight, line-height)
|
||||
- Motion values (durations and easing like `cubic-bezier(...)`)
|
||||
|
||||
Severity: strong warning (expected migration to tokens/primitives when possible).
|
||||
|
||||
## 3) Legacy token usage
|
||||
|
||||
In `_tokens.scss`, the compatibility section labeled
|
||||
"Legacy tokens (kept for compatibility)" is considered legacy usage.
|
||||
|
||||
When new or modified code uses these legacy token families, flag it as a
|
||||
migration opportunity and recommend semantic token usage where available.
|
||||
|
||||
Severity: strong warning (discourage new usage, allow compatibility paths).
|
||||
|
||||
## 4) Deprecated style and component surfaces
|
||||
|
||||
Flag new usage of deprecated/legacy style surfaces in design-system components,
|
||||
for example:
|
||||
|
||||
- `Button.legacy.scss` and legacy button override classes
|
||||
- Legacy button variants/types (for example `highlight`, `highlight-fill`)
|
||||
- Legacy component variants that exist for compatibility (for example legacy
|
||||
tabs variant)
|
||||
|
||||
Severity: strong warning (prefer modern semantic variants/components).
|
||||
|
||||
## 5) Token substitution changes
|
||||
|
||||
If a PR changes one token reference to another (for example
|
||||
`--text-color` -> `--text-color--subtle`), flag it as a soft warning.
|
||||
|
||||
Ask for intent in the PR description/comment:
|
||||
|
||||
- Intentional design adjustment, or
|
||||
- Potential accidental visual regression
|
||||
|
||||
Do not treat token substitution as a hard failure by default.
|
||||
33
.claude/plugins/n8n/skills/design-system/SKILL.md
Normal file
33
.claude/plugins/n8n/skills/design-system/SKILL.md
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
---
|
||||
name: n8n:design-system
|
||||
description: Guidelines on using Design System styles and components. Use when working on .vue files in packages/frontend. Triggers for tasks that include component architecture, styling, UI changes, or feature work.
|
||||
---
|
||||
|
||||
# Design System
|
||||
|
||||
Comprehensive guide for building, styling, and using components in the frontend.
|
||||
|
||||
## When to Apply
|
||||
Reference these guidelines when:
|
||||
- Working on `.{vue|css|scss}` files in `packages/frontend`
|
||||
- Adding new components to `packages/frontend/@n8n/design-system`
|
||||
- Refactoring styles for Vue components
|
||||
- Implementing new UI components or features
|
||||
- Reviewing changes to UI
|
||||
|
||||
## Rules
|
||||
- Follow guidelines in `packages/frontend/@n8n/design-system/src/styleguide/*.mdx`
|
||||
- ALWAYS use CSS variables for styles from `packages/frontend/@n8n/design-system/src/css/_tokens.scss` or `packages/frontend/@n8n/design-system/src/css/_primitives.scss`. Use hard-coded values only when no suitable tokens exist.
|
||||
- ALWAYS prefer using existing components from `packages/frontend/@n8n/design-system/src/components`. Prefer components that aren't marked `@deprecated`.
|
||||
- Use `light-dark()` when alternating colors for light/dark mode
|
||||
- When working with animations or transitions, ALWAYS prefer using mixins from `packages/frontend/@n8n/design-system/src/css/mixins/motion.scss`
|
||||
- When reviewing animations, follow the guides in `rules/web-animation-guidelines.md`
|
||||
- When reviewing UI changes or adding new components, follow `rules/web-interface-guidelines.md`
|
||||
|
||||
## Examples
|
||||
- "Add a modal dialog for confirming workflow deletion" → Use `N8nDialog`
|
||||
- "Add a dropdown to select workflow status" → Use `N8nDropdown` or `N8nSelect`
|
||||
- "Add button with + icon to add new tiem" → Wrap `N8nButton` with `iconOnly` prop with `N8nTooltip` and wrap in `N8nTooltip`. Use `N8nIcon` and proper aria-label.
|
||||
- "Add a destructive action button" → use `N8nButton` with `variant="destructive"`
|
||||
- "Make background color white/black" → Use `var(--background--surface)` for white on light mode and "black" on dark mode
|
||||
- "Animate the title in gracefully" -> Use `fade-in-up` mixin from `motion.scss` with `var(--duration--base)`
|
||||
|
|
@ -0,0 +1,93 @@
|
|||
# Web Motion Guidelines
|
||||
Design and implement web animations that feel natural and purposeful
|
||||
|
||||
## Timing and Duration
|
||||
|
||||
## Duration Guidelines
|
||||
|
||||
| Element Type | Duration |
|
||||
| --------------------------------- | --------- |
|
||||
| Micro-interactions | 100-150ms |
|
||||
| Standard UI (tooltips, dropdowns) | 150-250ms |
|
||||
| Modals, drawers | 200-300ms |
|
||||
|
||||
**Rules:**
|
||||
|
||||
- UI animations should stay under 300ms
|
||||
- Larger elements animate slower than smaller ones
|
||||
- Exit animations can be ~20% faster than entrance
|
||||
- Match duration to distance - longer travel = longer duration
|
||||
|
||||
### The Frequency
|
||||
|
||||
Determine how often users will see the animation:
|
||||
|
||||
- **100+ times/day** → No animation (or drastically reduced)
|
||||
- **Occasional use** → Standard animation
|
||||
- **Rare/first-time** → Can be more special
|
||||
|
||||
**Example:** Raycast never animates because users open it hundreds of times a day.
|
||||
|
||||
## When to Animate
|
||||
|
||||
**Do animate:**
|
||||
|
||||
- Enter/exit transitions for spatial consistency
|
||||
- State changes that benefit from visual continuity
|
||||
- Responses to user actions (feedback)
|
||||
- Rarely-used interactions where delight adds value
|
||||
|
||||
**Don't animate:**
|
||||
|
||||
- Keyboard-initiated actions
|
||||
- Hover effects on frequently-used elements
|
||||
- Anything users interact with 100+ times daily
|
||||
- When speed matters more than smoothness
|
||||
|
||||
## Performance
|
||||
|
||||
Prefer animating `transform` and `opacity`. These skip layout and paint stages, running entirely on the GPU.
|
||||
|
||||
**Avoid animating:**
|
||||
|
||||
- `padding`, `margin`, `height`, `width` (trigger layout)
|
||||
- `blur` filters above 20px (expensive, especially Safari)
|
||||
- CSS variables in deep component trees
|
||||
|
||||
### Optimization Techniques
|
||||
|
||||
```css
|
||||
/* Force GPU acceleration */
|
||||
.animated-element {
|
||||
will-change: transform;
|
||||
}
|
||||
```
|
||||
|
||||
## Practical Tips
|
||||
|
||||
Quick reference for common scenarios. See [PRACTICAL-TIPS.md](PRACTICAL-TIPS.md) for detailed implementations.
|
||||
|
||||
| Scenario | Solution |
|
||||
| ------------------------------- | ----------------------------------------------- |
|
||||
| Make buttons feel responsive | Add `transform: scale(0.97)` on `:active` |
|
||||
| Element appears from nowhere | Start from `scale(0.95)`, not `scale(0)` |
|
||||
| Shaky/jittery animations | Add `will-change: transform` |
|
||||
| Hover causes flicker | Animate child element, not parent |
|
||||
| Popover scales from wrong point | Set `transform-origin` to trigger location |
|
||||
| Sequential tooltips feel slow | Skip delay/animation after first tooltip |
|
||||
| Small buttons hard to tap | Use 44px minimum hit area (pseudo-element) |
|
||||
| Something still feels off | Add subtle blur (under 20px) to mask it |
|
||||
| Hover triggers on mobile | Use `@media (hover: hover) and (pointer: fine)` |
|
||||
|
||||
## Easing Decision Flowchart
|
||||
|
||||
Is the element entering or exiting the viewport?
|
||||
├── Yes → ease-out
|
||||
└── No
|
||||
├── Is it moving/morphing on screen?
|
||||
│ └── Yes → ease-in-out
|
||||
└── Is it a hover change?
|
||||
├── Yes → ease
|
||||
└── Is it constant motion?
|
||||
├── Yes → linear
|
||||
└── Default → ease-out
|
||||
|
|
@ -0,0 +1,98 @@
|
|||
# Web Interface Guidelines
|
||||
<!-- credit to https://github.com/raunofreiberg/interfaces -->
|
||||
This document outlines a non-exhaustive list of details that make a good (web) interface. It is a living document, periodically updated based on learnings. Some of these may be subjective, but most apply to all websites.
|
||||
|
||||
The [WAI-ARIA](https://www.w3.org/TR/wai-aria-1.1/) spec is deliberately not duplicated in this document. However, some accessibility guidelines may be pointed out. Contributions are welcome. Edit [this file](https://github.com/raunofreiberg/interfaces/blob/main/README.md) and submit a pull request.
|
||||
|
||||
## Interactivity
|
||||
|
||||
- Clicking the input label should focus the input field
|
||||
- Inputs should be wrapped with a `<form>` to submit by pressing Enter
|
||||
- Inputs should have an appropriate `type` like `password`, `email`, etc
|
||||
- Inputs should disable `spellcheck` and `autocomplete` attributes most of the time
|
||||
- Inputs should leverage HTML form validation by using the `required` attribute when appropriate
|
||||
- Input prefix and suffix decorations, such as icons, should be absolutely positioned on top of the text input with padding, not next to it, and trigger focus on the input
|
||||
- Toggles should immediately take effect, not require confirmation
|
||||
- Buttons should be disabled after submission to avoid duplicate network requests
|
||||
- Interactive elements should disable `user-select` for inner content
|
||||
- Decorative elements (glows, gradients) should disable `pointer-events` to not hijack events
|
||||
- Interactive elements in a vertical or horizontal list should have no dead areas between each element, instead, increase their `padding`
|
||||
|
||||
## Typography
|
||||
|
||||
- Fonts should have `-webkit-font-smoothing: antialiased` applied for better legibility
|
||||
- Fonts should have `text-rendering: optimizeLegibility` applied for better legibility
|
||||
- Fonts should be subset based on the content, alphabet or relevant language(s)
|
||||
- Font weight should not change on hover or selected state to prevent layout shift
|
||||
- Font weights below 400 should not be used
|
||||
- Medium sized headings generally look best with a font weight between 500-600
|
||||
- Adjust values fluidly by using CSS [`clamp()`](https://developer.mozilla.org/en-US/docs/Web/CSS/clamp), e.g. `clamp(48px, 5vw, 72px)` for the `font-size` of a heading
|
||||
- Where available, tabular figures should be applied with `font-variant-numeric: tabular-nums`, particularly in tables or when layout shifts are undesirable, like in timers
|
||||
- Prevent text resizing unexpectedly in landscape mode on iOS with `-webkit-text-size-adjust: 100%`
|
||||
|
||||
|
||||
## Motion
|
||||
|
||||
- Switching themes should not trigger transitions and animations on elements [^1]
|
||||
- Animation duration should not be more than 200ms for interactions to feel immediate
|
||||
- Animation values should be proportional to the trigger size:
|
||||
- Don't animate dialog scale in from 0 → 1, fade opacity and scale from ~0.8
|
||||
- Don't scale buttons on press from 1 → 0.8, but ~0.96, ~0.9, or so
|
||||
- Actions that are frequent and low in novelty should avoid extraneous animations: [^2]
|
||||
- Opening a right click menu
|
||||
- Deleting or adding items from a list
|
||||
- Hovering trivial buttons
|
||||
- Looping animations should pause when not visible on the screen to offload CPU and GPU usage
|
||||
- Use `scroll-behavior: smooth` for navigating to in-page anchors, with an appropriate offset
|
||||
|
||||
## Touch
|
||||
|
||||
- Hover states should not be visible on touch press, use `@media (hover: hover)` [^3]
|
||||
- Font size for inputs should not be smaller than 16px to prevent iOS zooming on focus
|
||||
- Inputs should not auto focus on touch devices as it will open the keyboard and cover the screen
|
||||
- Apply `muted` and `playsinline` to `<video />` tags to auto play on iOS
|
||||
- Disable `touch-action` for custom components that implement pan and zoom gestures to prevent interference from native behavior like zooming and scrolling
|
||||
- Disable the default iOS tap highlight with `-webkit-tap-highlight-color: rgba(0,0,0,0)`, but always replace it with an appropriate alternative
|
||||
|
||||
## Optimizations
|
||||
|
||||
- Large `blur()` values for `filter` and `backdrop-filter` may be slow
|
||||
- Scaling and blurring filled rectangles will cause banding, use radial gradients instead
|
||||
- Sparingly enable GPU rendering with `transform: translateZ(0)` for unperformant animations
|
||||
- Toggle `will-change` on unperformant scroll animations for the duration of the animation [^4]
|
||||
- Auto-playing too many videos on iOS will choke the device, pause or even unmount off-screen videos
|
||||
- Bypass React's render lifecycle with refs for real-time values that can commit to the DOM directly [^5]
|
||||
- [Detect and adapt](https://github.com/GoogleChromeLabs/react-adaptive-hooks) to the hardware and network capabilities of the user's device
|
||||
|
||||
## Accessibility
|
||||
|
||||
- Disabled buttons should not have tooltips, they are not accessible [^6]
|
||||
- Focusable elements in a sequential list should be navigable with <kbd>↑</kbd> <kbd>↓</kbd>
|
||||
- Focusable elements in a sequential list should be deletable with <kbd>⌘</kbd> <kbd>Backspace</kbd>
|
||||
- To open immediately on press, dropdown menus should trigger on `mousedown`, not `click`
|
||||
- Use a svg favicon with a style tag that adheres to the system theme based on `prefers-color-scheme`
|
||||
- Icon only interactive elements should define an explicit `aria-label`
|
||||
- Tooltips triggered by hover should not contain interactive content
|
||||
- Images should always be rendered with `<img>` for screen readers and ease of copying from the right click menu
|
||||
- Illustrations built with HTML should have an explicit `aria-label` instead of announcing the raw DOM tree to people using screen readers
|
||||
- Gradient text should unset the gradient on `::selection` state
|
||||
- When using nested menus, use a "prediction cone" to prevent the pointer from accidentally closing the menu when moving across other elements.
|
||||
|
||||
|
||||
## Design
|
||||
|
||||
- Optimistically update data locally and roll back on server error with feedback
|
||||
- Authentication redirects should happen on the server before the client loads to avoid janky URL changes
|
||||
- Style the document selection state with `::selection`
|
||||
- Display feedback relative to its trigger:
|
||||
- Show a temporary inline checkmark on a successful copy, not a notification
|
||||
- Highlight the relevant input(s) on form error(s)
|
||||
- Empty states should prompt to create a new item, with optional templates
|
||||
|
||||
[^1]: Switching between dark mode or light mode will trigger transitions on elements that are meant for explicit interactions like hover. We can [disable transitions temporarily](https://paco.me/writing/disable-theme-transitions) to prevent this. For Next.js, use [next-themes](https://github.com/pacocoursey/next-themes) which prevents transitions out of the box.
|
||||
[^2]: This is a matter of taste but some interactions just feel better with no motion. For example, the native macOS right click menu only animates out, not in, due to the frequent usage of it.
|
||||
[^3]: Most touch devices on press will temporarily flash the hover state, unless explicitly only defined for pointer devices with [`@media (hover: hover)`](https://developer.mozilla.org/en-US/docs/Web/CSS/@media/hover).
|
||||
[^4]: Use [`will-change`](https://developer.mozilla.org/en-US/docs/Web/CSS/will-change) as a last resort to improve performance. Pre-emptively throwing it on elements for better performance may have the opposite effect.
|
||||
[^5]: This might be controversial but sometimes it can be beneficial to manipulate the DOM directly. For example, instead of relying on React re-rendering on every wheel event, we can track the delta in a ref and update relevant elements directly in the callback.
|
||||
[^6]: Disabled buttons do not appear in tab order in the DOM so the tooltip will never be announced for keyboard users and they won't know why the button is disabled.
|
||||
[^7]: As of 2023, Safari will not take the border radius of an element into account when defining custom outline styles. [Safari 16.4](https://developer.apple.com/documentation/safari-release-notes/safari-16_4-release-notes) has added support for `outline` following the curve of border radius. However, keep in mind that not everyone updates their OS immediately.
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
---
|
||||
name: n8n:linear-issue
|
||||
description: Fetch and analyze Linear issue with all related context. Use when starting work on a Linear ticket, analyzing issues, or gathering context about a Linear issue.
|
||||
argument-hint: "[issue-id]"
|
||||
compatibility:
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
---
|
||||
name: n8n:loom-transcript
|
||||
description: Fetch and display the full transcript from a Loom video URL. Use when the user wants to get or read a Loom transcript.
|
||||
argument-hint: [loom-url]
|
||||
---
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
---
|
||||
name: n8n:node-add-oauth
|
||||
description: Add OAuth2 credential support to an existing n8n node — creates the credential file, updates the node, adds tests, and keeps the CLI constant in sync. Use when the user says /node-add-oauth.
|
||||
argument-hint: "[node-name] [optional: custom-scopes flag or scope list]"
|
||||
---
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
---
|
||||
name: n8n:protect-endpoints
|
||||
description: Applies n8n's RBAC scope decorators to REST endpoints. Use when creating a new @RestController, adding any @Get/@Post/@Put/@Patch/@Delete route to an existing controller, or reviewing endpoint authorization. Every authenticated endpoint must be gated by @ProjectScope or @GlobalScope.
|
||||
---
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
---
|
||||
name: n8n:reproduce-bug
|
||||
description: Reproduce a bug from a Linear ticket with a failing test. Expects the full ticket context (title, description, comments) to be provided as input.
|
||||
---
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
---
|
||||
name: n8n:setup-mcps
|
||||
description: >-
|
||||
Configure MCP servers for n8n development. Use when the user says /setup-mcps
|
||||
or asks to set up MCP servers for n8n.
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
---
|
||||
name: n8n:spec-driven-development
|
||||
description: Keeps implementation and specs in sync. Use when working on a feature that has a spec in .claude/specs/, when the user says /spec, or when starting implementation of a documented feature. Also use when the user asks to verify implementation against a spec or update a spec after changes.
|
||||
---
|
||||
|
||||
|
|
|
|||
1
.claude/skills
Symbolic link
1
.claude/skills
Symbolic link
|
|
@ -0,0 +1 @@
|
|||
plugins/n8n/skills
|
||||
8
.github/CODEOWNERS
vendored
8
.github/CODEOWNERS
vendored
|
|
@ -1,6 +1,6 @@
|
|||
packages/@n8n/db/src/migrations/ @n8n-io/migrations-review
|
||||
.github/workflows @n8n-io/ci-admins
|
||||
.github/scripts @n8n-io/ci-admins
|
||||
.github/actions @n8n-io/ci-admins
|
||||
.github/poutine-rules @n8n-io/ci-admins
|
||||
.github/workflows @n8n-io/qa-dx
|
||||
.github/scripts @n8n-io/qa-dx
|
||||
.github/actions @n8n-io/qa-dx
|
||||
.github/poutine-rules @n8n-io/qa-dx
|
||||
|
||||
|
|
|
|||
367
.github/scripts/bump-versions.mjs
vendored
367
.github/scripts/bump-versions.mjs
vendored
|
|
@ -11,7 +11,7 @@ const exec = promisify(child_process.exec);
|
|||
/**
|
||||
* @param {string | semver.SemVer} currentVersion
|
||||
*/
|
||||
function generateExperimentalVersion(currentVersion) {
|
||||
export function generateExperimentalVersion(currentVersion) {
|
||||
const parsed = semver.parse(currentVersion);
|
||||
if (!parsed) throw new Error(`Invalid version: ${currentVersion}`);
|
||||
|
||||
|
|
@ -28,84 +28,31 @@ function generateExperimentalVersion(currentVersion) {
|
|||
return `${parsed.major}.${parsed.minor}.${parsed.patch}-exp.0`;
|
||||
}
|
||||
|
||||
const rootDir = process.cwd();
|
||||
|
||||
const releaseType = /** @type { import('semver').ReleaseType | "experimental" } */ (
|
||||
process.env.RELEASE_TYPE
|
||||
);
|
||||
assert.match(releaseType, /^(patch|minor|major|experimental|premajor)$/, 'Invalid RELEASE_TYPE');
|
||||
|
||||
// TODO: if releaseType is `auto` determine release type based on the changelog
|
||||
|
||||
const lastTag = (await exec('git describe --tags --match "n8n@*" --abbrev=0')).stdout.trim();
|
||||
const packages = JSON.parse(
|
||||
(
|
||||
await exec(
|
||||
`pnpm ls -r --only-projects --json | jq -r '[.[] | { name: .name, version: .version, path: .path, private: .private}]'`,
|
||||
)
|
||||
).stdout,
|
||||
);
|
||||
|
||||
const packageMap = {};
|
||||
for (let { name, path, version, private: isPrivate } of packages) {
|
||||
if (isPrivate && path !== rootDir) {
|
||||
continue;
|
||||
}
|
||||
if (path === rootDir) {
|
||||
name = 'monorepo-root';
|
||||
}
|
||||
|
||||
const isDirty = await exec(`git diff --quiet HEAD ${lastTag} -- ${path}`)
|
||||
.then(() => false)
|
||||
.catch((error) => true);
|
||||
|
||||
packageMap[name] = { path, isDirty, version };
|
||||
/**
|
||||
* @param {{ pnpm?: { overrides?: Record<string, string> }, overrides?: Record<string, string> }} pkg
|
||||
* @returns {Record<string, string>}
|
||||
*/
|
||||
export function getOverrides(pkg) {
|
||||
return { ...pkg.pnpm?.overrides, ...pkg.overrides };
|
||||
}
|
||||
|
||||
assert.ok(
|
||||
Object.values(packageMap).some(({ isDirty }) => isDirty),
|
||||
'No changes found since the last release',
|
||||
);
|
||||
|
||||
// Propagate isDirty transitively: if a package's dependency will be bumped,
|
||||
// that package also needs a bump (e.g. design-system → editor-ui → cli).
|
||||
|
||||
// Detect root-level changes that affect resolved dep versions without touching individual
|
||||
// package.json files: pnpm.overrides (applies to all specifiers)
|
||||
// and pnpm-workspace.yaml catalog entries (applies only to deps using a "catalog:…" specifier).
|
||||
|
||||
const rootPkgJson = JSON.parse(await readFile(resolve(rootDir, 'package.json'), 'utf-8'));
|
||||
const rootPkgJsonAtTag = await exec(`git show ${lastTag}:package.json`)
|
||||
.then(({ stdout }) => JSON.parse(stdout))
|
||||
.catch(() => ({}));
|
||||
|
||||
const getOverrides = (pkg) => ({ ...pkg.pnpm?.overrides, ...pkg.overrides });
|
||||
|
||||
const currentOverrides = getOverrides(rootPkgJson);
|
||||
const previousOverrides = getOverrides(rootPkgJsonAtTag);
|
||||
|
||||
const changedOverrides = new Set(
|
||||
Object.keys({ ...currentOverrides, ...previousOverrides }).filter(
|
||||
(k) => currentOverrides[k] !== previousOverrides[k],
|
||||
),
|
||||
);
|
||||
|
||||
const parseWorkspaceYaml = (content) => {
|
||||
/**
|
||||
* @param {string} content
|
||||
* @returns {Record<string, unknown>}
|
||||
*/
|
||||
export function parseWorkspaceYaml(content) {
|
||||
try {
|
||||
return /** @type {Record<string, unknown>} */ (parse(content) ?? {});
|
||||
} catch {
|
||||
return {};
|
||||
}
|
||||
};
|
||||
const workspaceYaml = parseWorkspaceYaml(
|
||||
await readFile(resolve(rootDir, 'pnpm-workspace.yaml'), 'utf-8').catch(() => ''),
|
||||
);
|
||||
const workspaceYamlAtTag = parseWorkspaceYaml(
|
||||
await exec(`git show ${lastTag}:pnpm-workspace.yaml`)
|
||||
.then(({ stdout }) => stdout)
|
||||
.catch(() => ''),
|
||||
);
|
||||
const getCatalogs = (ws) => {
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {Record<string, unknown>} ws
|
||||
* @returns {Map<string, Record<string, string>>}
|
||||
*/
|
||||
export function getCatalogs(ws) {
|
||||
const result = new Map();
|
||||
if (ws.catalog) {
|
||||
result.set('default', /** @type {Record<string,string>} */ (ws.catalog));
|
||||
|
|
@ -116,98 +63,232 @@ const getCatalogs = (ws) => {
|
|||
}
|
||||
|
||||
return result;
|
||||
};
|
||||
// changedCatalogEntries: Map<catalogName, Set<depName>>
|
||||
const currentCatalogs = getCatalogs(workspaceYaml);
|
||||
const previousCatalogs = getCatalogs(workspaceYamlAtTag);
|
||||
const changedCatalogEntries = new Map();
|
||||
for (const catalogName of new Set([...currentCatalogs.keys(), ...previousCatalogs.keys()])) {
|
||||
const current = currentCatalogs.get(catalogName) ?? {};
|
||||
const previous = previousCatalogs.get(catalogName) ?? {};
|
||||
const changedDeps = new Set(
|
||||
Object.keys({ ...current, ...previous }).filter((dep) => current[dep] !== previous[dep]),
|
||||
);
|
||||
if (changedDeps.size > 0) {
|
||||
changedCatalogEntries.set(catalogName, changedDeps);
|
||||
}
|
||||
}
|
||||
|
||||
// Store full dep objects (with specifiers) so we can inspect "catalog:…" values below.
|
||||
const depsByPackage = {};
|
||||
for (const packageName in packageMap) {
|
||||
const packageFile = resolve(packageMap[packageName].path, 'package.json');
|
||||
const packageJson = JSON.parse(await readFile(packageFile, 'utf-8'));
|
||||
depsByPackage[packageName] = /** @type {Record<string,string>} */ (
|
||||
packageJson.dependencies ?? {}
|
||||
/**
|
||||
* @param {Record<string, string>} currentOverrides
|
||||
* @param {Record<string, string>} previousOverrides
|
||||
* @returns {Set<string>}
|
||||
*/
|
||||
export function computeChangedOverrides(currentOverrides, previousOverrides) {
|
||||
return new Set(
|
||||
Object.keys({ ...currentOverrides, ...previousOverrides }).filter(
|
||||
(k) => currentOverrides[k] !== previousOverrides[k],
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
// Mark packages dirty if any dep had a root-level override or catalog version change.
|
||||
for (const [packageName, deps] of Object.entries(depsByPackage)) {
|
||||
if (packageMap[packageName].isDirty) continue;
|
||||
for (const [dep, specifier] of Object.entries(deps)) {
|
||||
if (changedOverrides.has(dep)) {
|
||||
packageMap[packageName].isDirty = true;
|
||||
break;
|
||||
/**
|
||||
* @param {Map<string, Record<string, string>>} currentCatalogs
|
||||
* @param {Map<string, Record<string, string>>} previousCatalogs
|
||||
* @returns {Map<string, Set<string>>}
|
||||
*/
|
||||
export function computeChangedCatalogEntries(currentCatalogs, previousCatalogs) {
|
||||
const changedCatalogEntries = new Map();
|
||||
for (const catalogName of new Set([...currentCatalogs.keys(), ...previousCatalogs.keys()])) {
|
||||
const current = currentCatalogs.get(catalogName) ?? {};
|
||||
const previous = previousCatalogs.get(catalogName) ?? {};
|
||||
const changedDeps = new Set(
|
||||
Object.keys({ ...current, ...previous }).filter((dep) => current[dep] !== previous[dep]),
|
||||
);
|
||||
if (changedDeps.size > 0) {
|
||||
changedCatalogEntries.set(catalogName, changedDeps);
|
||||
}
|
||||
if (typeof specifier === 'string' && specifier.startsWith('catalog:')) {
|
||||
const catalogName = specifier === 'catalog:' ? 'default' : specifier.slice(8);
|
||||
if (changedCatalogEntries.get(catalogName)?.has(dep)) {
|
||||
}
|
||||
return changedCatalogEntries;
|
||||
}
|
||||
|
||||
/**
|
||||
* Mark packages as dirty if any dep had a root-level override or catalog version change.
|
||||
* Mutates packageMap in place.
|
||||
*
|
||||
* @param {Record<string, { isDirty: boolean }>} packageMap
|
||||
* @param {Record<string, Record<string, string>>} depsByPackage
|
||||
* @param {Set<string>} changedOverrides
|
||||
* @param {Map<string, Set<string>>} changedCatalogEntries
|
||||
*/
|
||||
export function markDirtyByRootChanges(
|
||||
packageMap,
|
||||
depsByPackage,
|
||||
changedOverrides,
|
||||
changedCatalogEntries,
|
||||
) {
|
||||
for (const [packageName, deps] of Object.entries(depsByPackage)) {
|
||||
if (packageMap[packageName].isDirty) continue;
|
||||
for (const [dep, specifier] of Object.entries(deps)) {
|
||||
if (changedOverrides.has(dep)) {
|
||||
packageMap[packageName].isDirty = true;
|
||||
break;
|
||||
}
|
||||
if (typeof specifier === 'string' && specifier.startsWith('catalog:')) {
|
||||
const catalogName = specifier === 'catalog:' ? 'default' : specifier.slice(8);
|
||||
if (changedCatalogEntries.get(catalogName)?.has(dep)) {
|
||||
packageMap[packageName].isDirty = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let changed = true;
|
||||
while (changed) {
|
||||
changed = false;
|
||||
for (const packageName in packageMap) {
|
||||
if (packageMap[packageName].isDirty) continue;
|
||||
if (Object.keys(depsByPackage[packageName]).some((dep) => packageMap[dep]?.isDirty)) {
|
||||
packageMap[packageName].isDirty = true;
|
||||
changed = true;
|
||||
/**
|
||||
* Propagate isDirty transitively: if a package's dependency will be bumped,
|
||||
* that package also needs a bump. Mutates packageMap in place.
|
||||
*
|
||||
* @param {Record<string, { isDirty: boolean }>} packageMap
|
||||
* @param {Record<string, Record<string, string>>} depsByPackage
|
||||
*/
|
||||
export function propagateDirtyTransitively(packageMap, depsByPackage) {
|
||||
let changed = true;
|
||||
while (changed) {
|
||||
changed = false;
|
||||
for (const packageName in packageMap) {
|
||||
if (packageMap[packageName].isDirty) continue;
|
||||
if (Object.keys(depsByPackage[packageName]).some((dep) => packageMap[dep]?.isDirty)) {
|
||||
packageMap[packageName].isDirty = true;
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Keep the monorepo version up to date with the released version
|
||||
packageMap['monorepo-root'].version = packageMap['n8n'].version;
|
||||
|
||||
for (const packageName in packageMap) {
|
||||
const { path, version, isDirty } = packageMap[packageName];
|
||||
const packageFile = resolve(path, 'package.json');
|
||||
const packageJson = JSON.parse(await readFile(packageFile, 'utf-8'));
|
||||
|
||||
const dependencyIsDirty = Object.keys(packageJson.dependencies || {}).some(
|
||||
(dependencyName) => packageMap[dependencyName]?.isDirty,
|
||||
);
|
||||
|
||||
let newVersion = version;
|
||||
|
||||
if (isDirty || dependencyIsDirty) {
|
||||
switch (releaseType) {
|
||||
case 'experimental':
|
||||
newVersion = generateExperimentalVersion(version);
|
||||
break;
|
||||
case 'premajor':
|
||||
newVersion = semver.inc(
|
||||
/**
|
||||
* @param {string} version
|
||||
* @param {import('semver').ReleaseType | 'experimental'} releaseType
|
||||
* @returns {string}
|
||||
*/
|
||||
export function computeNewVersion(version, releaseType) {
|
||||
switch (releaseType) {
|
||||
case 'experimental':
|
||||
return generateExperimentalVersion(version);
|
||||
case 'premajor':
|
||||
return /** @type {string} */ (
|
||||
semver.inc(
|
||||
version,
|
||||
version.includes('-rc.') ? 'prerelease' : 'premajor',
|
||||
undefined,
|
||||
'rc',
|
||||
);
|
||||
break;
|
||||
default:
|
||||
newVersion = semver.inc(version, releaseType);
|
||||
break;
|
||||
}
|
||||
)
|
||||
);
|
||||
default:
|
||||
return /** @type {string} */ (semver.inc(version, releaseType));
|
||||
}
|
||||
|
||||
packageJson.version = packageMap[packageName].nextVersion = newVersion;
|
||||
|
||||
await writeFile(packageFile, JSON.stringify(packageJson, null, 2) + '\n');
|
||||
}
|
||||
|
||||
console.log(packageMap['n8n'].nextVersion);
|
||||
async function bumpVersions() {
|
||||
const rootDir = process.cwd();
|
||||
|
||||
const releaseType = /** @type { import('semver').ReleaseType | "experimental" } */ (
|
||||
process.env.RELEASE_TYPE
|
||||
);
|
||||
assert.match(releaseType, /^(patch|minor|major|experimental|premajor)$/, 'Invalid RELEASE_TYPE');
|
||||
|
||||
// TODO: if releaseType is `auto` determine release type based on the changelog
|
||||
|
||||
const lastTag = (await exec('git describe --tags --match "n8n@*" --abbrev=0')).stdout.trim();
|
||||
const packages = JSON.parse(
|
||||
(
|
||||
await exec(
|
||||
`pnpm ls -r --only-projects --json | jq -r '[.[] | { name: .name, version: .version, path: .path, private: .private}]'`,
|
||||
)
|
||||
).stdout,
|
||||
);
|
||||
|
||||
/** @type {Record<string, { path: string, isDirty: boolean, version: string, nextVersion?: string }>} */
|
||||
const packageMap = {};
|
||||
for (let { name, path, version, private: isPrivate } of packages) {
|
||||
if (isPrivate && path !== rootDir) {
|
||||
continue;
|
||||
}
|
||||
if (path === rootDir) {
|
||||
name = 'monorepo-root';
|
||||
}
|
||||
|
||||
const isDirty = await exec(`git diff --quiet HEAD ${lastTag} -- ${path}`)
|
||||
.then(() => false)
|
||||
.catch(() => true);
|
||||
|
||||
packageMap[name] = { path, isDirty, version };
|
||||
}
|
||||
|
||||
assert.ok(
|
||||
Object.values(packageMap).some(({ isDirty }) => isDirty),
|
||||
'No changes found since the last release',
|
||||
);
|
||||
|
||||
// Propagate isDirty transitively: if a package's dependency will be bumped,
|
||||
// that package also needs a bump (e.g. design-system → editor-ui → cli).
|
||||
|
||||
// Detect root-level changes that affect resolved dep versions without touching individual
|
||||
// package.json files: pnpm.overrides (applies to all specifiers)
|
||||
// and pnpm-workspace.yaml catalog entries (applies only to deps using a "catalog:…" specifier).
|
||||
|
||||
const rootPkgJson = JSON.parse(await readFile(resolve(rootDir, 'package.json'), 'utf-8'));
|
||||
const rootPkgJsonAtTag = await exec(`git show ${lastTag}:package.json`)
|
||||
.then(({ stdout }) => JSON.parse(stdout))
|
||||
.catch(() => ({}));
|
||||
|
||||
const changedOverrides = computeChangedOverrides(
|
||||
getOverrides(rootPkgJson),
|
||||
getOverrides(rootPkgJsonAtTag),
|
||||
);
|
||||
|
||||
const workspaceYaml = parseWorkspaceYaml(
|
||||
await readFile(resolve(rootDir, 'pnpm-workspace.yaml'), 'utf-8').catch(() => ''),
|
||||
);
|
||||
const workspaceYamlAtTag = parseWorkspaceYaml(
|
||||
await exec(`git show ${lastTag}:pnpm-workspace.yaml`)
|
||||
.then(({ stdout }) => stdout)
|
||||
.catch(() => ''),
|
||||
);
|
||||
const changedCatalogEntries = computeChangedCatalogEntries(
|
||||
getCatalogs(workspaceYaml),
|
||||
getCatalogs(workspaceYamlAtTag),
|
||||
);
|
||||
|
||||
// Store full dep objects (with specifiers) so we can inspect "catalog:…" values below.
|
||||
/** @type {Record<string, Record<string, string>>} */
|
||||
const depsByPackage = {};
|
||||
for (const packageName in packageMap) {
|
||||
const packageFile = resolve(packageMap[packageName].path, 'package.json');
|
||||
const packageJson = JSON.parse(await readFile(packageFile, 'utf-8'));
|
||||
depsByPackage[packageName] = /** @type {Record<string,string>} */ (
|
||||
packageJson.dependencies ?? {}
|
||||
);
|
||||
}
|
||||
|
||||
// Mark packages dirty if any dep had a root-level override or catalog version change.
|
||||
markDirtyByRootChanges(packageMap, depsByPackage, changedOverrides, changedCatalogEntries);
|
||||
|
||||
propagateDirtyTransitively(packageMap, depsByPackage);
|
||||
|
||||
// Keep the monorepo version up to date with the released version
|
||||
packageMap['monorepo-root'].version = packageMap['n8n'].version;
|
||||
|
||||
for (const packageName in packageMap) {
|
||||
const { path, version, isDirty } = packageMap[packageName];
|
||||
const packageFile = resolve(path, 'package.json');
|
||||
const packageJson = JSON.parse(await readFile(packageFile, 'utf-8'));
|
||||
|
||||
const dependencyIsDirty = Object.keys(packageJson.dependencies || {}).some(
|
||||
(dependencyName) => packageMap[dependencyName]?.isDirty,
|
||||
);
|
||||
|
||||
let newVersion = version;
|
||||
|
||||
if (isDirty || dependencyIsDirty) {
|
||||
newVersion = computeNewVersion(version, releaseType);
|
||||
}
|
||||
|
||||
packageJson.version = packageMap[packageName].nextVersion = newVersion;
|
||||
|
||||
await writeFile(packageFile, JSON.stringify(packageJson, null, 2) + '\n');
|
||||
}
|
||||
|
||||
console.log(packageMap['n8n'].nextVersion);
|
||||
}
|
||||
|
||||
// only run when executed directly, not when imported by tests
|
||||
if (import.meta.url === `file://${process.argv[1]}`) {
|
||||
bumpVersions();
|
||||
}
|
||||
|
|
|
|||
380
.github/scripts/bump-versions.test.mjs
vendored
Normal file
380
.github/scripts/bump-versions.test.mjs
vendored
Normal file
|
|
@ -0,0 +1,380 @@
|
|||
/**
|
||||
* Run these tests with:
|
||||
*
|
||||
* node --test ./.github/scripts/bump-versions.test.mjs
|
||||
*/
|
||||
|
||||
import { describe, it } from 'node:test';
|
||||
import assert from 'node:assert/strict';
|
||||
import {
|
||||
generateExperimentalVersion,
|
||||
getOverrides,
|
||||
parseWorkspaceYaml,
|
||||
getCatalogs,
|
||||
computeChangedOverrides,
|
||||
computeChangedCatalogEntries,
|
||||
markDirtyByRootChanges,
|
||||
propagateDirtyTransitively,
|
||||
computeNewVersion,
|
||||
} from './bump-versions.mjs';
|
||||
|
||||
describe('generateExperimentalVersion', () => {
|
||||
it('creates -exp.0 from a stable version', () => {
|
||||
assert.equal(generateExperimentalVersion('1.2.3'), '1.2.3-exp.0');
|
||||
});
|
||||
|
||||
it('increments exp minor when already at exp.0', () => {
|
||||
assert.equal(generateExperimentalVersion('1.2.3-exp.0'), '1.2.3-exp.1');
|
||||
});
|
||||
|
||||
it('increments exp minor when already at exp.5', () => {
|
||||
assert.equal(generateExperimentalVersion('1.2.3-exp.5'), '1.2.3-exp.6');
|
||||
});
|
||||
|
||||
it('creates -exp.0 from a version with a different pre-release tag', () => {
|
||||
assert.equal(generateExperimentalVersion('1.2.3-beta.1'), '1.2.3-exp.0');
|
||||
});
|
||||
|
||||
it('handles multi-digit version numbers', () => {
|
||||
assert.equal(generateExperimentalVersion('10.20.30'), '10.20.30-exp.0');
|
||||
});
|
||||
|
||||
it('throws on an invalid version string', () => {
|
||||
assert.throws(() => generateExperimentalVersion('not-a-version'), /Invalid version/);
|
||||
});
|
||||
});
|
||||
|
||||
describe('getOverrides', () => {
|
||||
it('returns empty object when no overrides exist', () => {
|
||||
assert.deepEqual(getOverrides({}), {});
|
||||
});
|
||||
|
||||
it('returns pnpm.overrides when only pnpm.overrides is set', () => {
|
||||
assert.deepEqual(getOverrides({ pnpm: { overrides: { lodash: '^4.0.0' } } }), {
|
||||
lodash: '^4.0.0',
|
||||
});
|
||||
});
|
||||
|
||||
it('returns overrides when only top-level overrides is set', () => {
|
||||
assert.deepEqual(getOverrides({ overrides: { lodash: '^4.0.0' } }), { lodash: '^4.0.0' });
|
||||
});
|
||||
|
||||
it('merges both fields with top-level overrides taking precedence for the same key', () => {
|
||||
assert.deepEqual(
|
||||
getOverrides({
|
||||
pnpm: { overrides: { lodash: '^3.0.0', underscore: '^1.0.0' } },
|
||||
overrides: { lodash: '^4.0.0' },
|
||||
}),
|
||||
{ lodash: '^4.0.0', underscore: '^1.0.0' },
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('parseWorkspaceYaml', () => {
|
||||
it('parses valid YAML into an object', () => {
|
||||
assert.deepEqual(parseWorkspaceYaml('catalog:\n lodash: "^4.0.0"'), {
|
||||
catalog: { lodash: '^4.0.0' },
|
||||
});
|
||||
});
|
||||
|
||||
it('returns empty object for an empty string', () => {
|
||||
assert.deepEqual(parseWorkspaceYaml(''), {});
|
||||
});
|
||||
|
||||
it('returns empty object for invalid YAML', () => {
|
||||
assert.deepEqual(parseWorkspaceYaml(': - invalid: [yaml}'), {});
|
||||
});
|
||||
});
|
||||
|
||||
describe('getCatalogs', () => {
|
||||
it('returns empty map when no catalog or catalogs field exists', () => {
|
||||
assert.equal(getCatalogs({}).size, 0);
|
||||
});
|
||||
|
||||
it('returns a "default" entry for the top-level catalog field', () => {
|
||||
const result = getCatalogs({ catalog: { lodash: '^4.0.0' } });
|
||||
assert.equal(result.size, 1);
|
||||
assert.deepEqual(result.get('default'), { lodash: '^4.0.0' });
|
||||
});
|
||||
|
||||
it('returns named entries from the catalogs field', () => {
|
||||
const result = getCatalogs({ catalogs: { react18: { react: '^18.0.0' } } });
|
||||
assert.equal(result.size, 1);
|
||||
assert.deepEqual(result.get('react18'), { react: '^18.0.0' });
|
||||
});
|
||||
|
||||
it('returns both default and named catalog entries when both fields are present', () => {
|
||||
const result = getCatalogs({
|
||||
catalog: { lodash: '^4.0.0' },
|
||||
catalogs: { react18: { react: '^18.0.0' } },
|
||||
});
|
||||
assert.equal(result.size, 2);
|
||||
assert.deepEqual(result.get('default'), { lodash: '^4.0.0' });
|
||||
assert.deepEqual(result.get('react18'), { react: '^18.0.0' });
|
||||
});
|
||||
});
|
||||
|
||||
describe('computeChangedOverrides', () => {
|
||||
it('returns empty set when nothing changed', () => {
|
||||
assert.equal(computeChangedOverrides({ lodash: '^4' }, { lodash: '^4' }).size, 0);
|
||||
});
|
||||
|
||||
it('detects an added override', () => {
|
||||
const result = computeChangedOverrides({ lodash: '^4' }, {});
|
||||
assert.ok(result.has('lodash'));
|
||||
});
|
||||
|
||||
it('detects a removed override', () => {
|
||||
const result = computeChangedOverrides({}, { lodash: '^4' });
|
||||
assert.ok(result.has('lodash'));
|
||||
});
|
||||
|
||||
it('detects a changed override value', () => {
|
||||
const result = computeChangedOverrides({ lodash: '^4' }, { lodash: '^3' });
|
||||
assert.ok(result.has('lodash'));
|
||||
});
|
||||
|
||||
it('does not include unchanged overrides', () => {
|
||||
const result = computeChangedOverrides(
|
||||
{ lodash: '^4', underscore: '^1' },
|
||||
{ lodash: '^4', underscore: '^1' },
|
||||
);
|
||||
assert.equal(result.size, 0);
|
||||
});
|
||||
|
||||
it('handles mixed changed and unchanged overrides', () => {
|
||||
const result = computeChangedOverrides(
|
||||
{ lodash: '^4', underscore: '^2' },
|
||||
{ lodash: '^4', underscore: '^1' },
|
||||
);
|
||||
assert.equal(result.size, 1);
|
||||
assert.ok(result.has('underscore'));
|
||||
assert.ok(!result.has('lodash'));
|
||||
});
|
||||
});
|
||||
|
||||
describe('computeChangedCatalogEntries', () => {
|
||||
it('returns empty map when nothing changed', () => {
|
||||
const current = new Map([['default', { lodash: '^4' }]]);
|
||||
const previous = new Map([['default', { lodash: '^4' }]]);
|
||||
assert.equal(computeChangedCatalogEntries(current, previous).size, 0);
|
||||
});
|
||||
|
||||
it('detects an added dep in a catalog', () => {
|
||||
const current = new Map([['default', { lodash: '^4' }]]);
|
||||
const previous = new Map([['default', {}]]);
|
||||
const result = computeChangedCatalogEntries(current, previous);
|
||||
assert.ok(result.get('default')?.has('lodash'));
|
||||
});
|
||||
|
||||
it('detects a removed dep from a catalog', () => {
|
||||
const current = new Map([['default', {}]]);
|
||||
const previous = new Map([['default', { lodash: '^4' }]]);
|
||||
const result = computeChangedCatalogEntries(current, previous);
|
||||
assert.ok(result.get('default')?.has('lodash'));
|
||||
});
|
||||
|
||||
it('detects a changed dep version in a catalog', () => {
|
||||
const current = new Map([['default', { lodash: '^4' }]]);
|
||||
const previous = new Map([['default', { lodash: '^3' }]]);
|
||||
const result = computeChangedCatalogEntries(current, previous);
|
||||
assert.ok(result.get('default')?.has('lodash'));
|
||||
});
|
||||
|
||||
it('detects changes in a named catalog', () => {
|
||||
const current = new Map([['react18', { react: '^18' }]]);
|
||||
const previous = new Map([['react18', { react: '^17' }]]);
|
||||
const result = computeChangedCatalogEntries(current, previous);
|
||||
assert.ok(result.get('react18')?.has('react'));
|
||||
});
|
||||
|
||||
it('detects a newly added catalog', () => {
|
||||
const current = new Map([['newCatalog', { lodash: '^4' }]]);
|
||||
const previous = new Map();
|
||||
const result = computeChangedCatalogEntries(current, previous);
|
||||
assert.ok(result.get('newCatalog')?.has('lodash'));
|
||||
});
|
||||
|
||||
it('detects a removed catalog', () => {
|
||||
const current = new Map();
|
||||
const previous = new Map([['oldCatalog', { lodash: '^4' }]]);
|
||||
const result = computeChangedCatalogEntries(current, previous);
|
||||
assert.ok(result.get('oldCatalog')?.has('lodash'));
|
||||
});
|
||||
|
||||
it('does not include a catalog that has no changed entries', () => {
|
||||
const current = new Map([
|
||||
['default', { lodash: '^4' }],
|
||||
['react18', { react: '^18' }],
|
||||
]);
|
||||
const previous = new Map([
|
||||
['default', { lodash: '^3' }],
|
||||
['react18', { react: '^18' }],
|
||||
]);
|
||||
const result = computeChangedCatalogEntries(current, previous);
|
||||
assert.ok(result.has('default'));
|
||||
assert.ok(!result.has('react18'));
|
||||
});
|
||||
});
|
||||
|
||||
describe('markDirtyByRootChanges', () => {
|
||||
it('marks a package dirty when its dep appears in changedOverrides', () => {
|
||||
const packageMap = { 'pkg-a': { isDirty: false } };
|
||||
const depsByPackage = { 'pkg-a': { lodash: '^4' } };
|
||||
markDirtyByRootChanges(packageMap, depsByPackage, new Set(['lodash']), new Map());
|
||||
assert.ok(packageMap['pkg-a'].isDirty);
|
||||
});
|
||||
|
||||
it('skips already-dirty packages', () => {
|
||||
const packageMap = { 'pkg-a': { isDirty: true } };
|
||||
// No deps, but package is already dirty — should not throw or change state
|
||||
const depsByPackage = { 'pkg-a': {} };
|
||||
markDirtyByRootChanges(packageMap, depsByPackage, new Set(['lodash']), new Map());
|
||||
assert.ok(packageMap['pkg-a'].isDirty);
|
||||
});
|
||||
|
||||
it('marks a package dirty when its dep uses "catalog:" (default catalog) and that entry changed', () => {
|
||||
const packageMap = { 'pkg-a': { isDirty: false } };
|
||||
const depsByPackage = { 'pkg-a': { lodash: 'catalog:' } };
|
||||
const changedCatalogEntries = new Map([['default', new Set(['lodash'])]]);
|
||||
markDirtyByRootChanges(packageMap, depsByPackage, new Set(), changedCatalogEntries);
|
||||
assert.ok(packageMap['pkg-a'].isDirty);
|
||||
});
|
||||
|
||||
it('marks a package dirty when its dep uses "catalog:<name>" and that named catalog entry changed', () => {
|
||||
const packageMap = { 'pkg-a': { isDirty: false } };
|
||||
const depsByPackage = { 'pkg-a': { react: 'catalog:react18' } };
|
||||
const changedCatalogEntries = new Map([['react18', new Set(['react'])]]);
|
||||
markDirtyByRootChanges(packageMap, depsByPackage, new Set(), changedCatalogEntries);
|
||||
assert.ok(packageMap['pkg-a'].isDirty);
|
||||
});
|
||||
|
||||
it('does not mark a package dirty when none of its deps changed', () => {
|
||||
const packageMap = { 'pkg-a': { isDirty: false } };
|
||||
const depsByPackage = { 'pkg-a': { lodash: '^4' } };
|
||||
markDirtyByRootChanges(packageMap, depsByPackage, new Set(['underscore']), new Map());
|
||||
assert.ok(!packageMap['pkg-a'].isDirty);
|
||||
});
|
||||
|
||||
it('does not mark a package dirty when a catalog: dep is in a catalog with no changes', () => {
|
||||
const packageMap = { 'pkg-a': { isDirty: false } };
|
||||
const depsByPackage = { 'pkg-a': { lodash: 'catalog:' } };
|
||||
const changedCatalogEntries = new Map([['default', new Set(['underscore'])]]);
|
||||
markDirtyByRootChanges(packageMap, depsByPackage, new Set(), changedCatalogEntries);
|
||||
assert.ok(!packageMap['pkg-a'].isDirty);
|
||||
});
|
||||
|
||||
it('does not mark a package dirty when a catalog: dep is in a different catalog than the one that changed', () => {
|
||||
const packageMap = { 'pkg-a': { isDirty: false } };
|
||||
const depsByPackage = { 'pkg-a': { react: 'catalog:react18' } };
|
||||
const changedCatalogEntries = new Map([['default', new Set(['react'])]]);
|
||||
markDirtyByRootChanges(packageMap, depsByPackage, new Set(), changedCatalogEntries);
|
||||
assert.ok(!packageMap['pkg-a'].isDirty);
|
||||
});
|
||||
});
|
||||
|
||||
describe('propagateDirtyTransitively', () => {
|
||||
it('does nothing when no packages are dirty', () => {
|
||||
const packageMap = {
|
||||
'pkg-a': { isDirty: false },
|
||||
'pkg-b': { isDirty: false },
|
||||
};
|
||||
const depsByPackage = {
|
||||
'pkg-a': { 'pkg-b': 'workspace:*' },
|
||||
'pkg-b': {},
|
||||
};
|
||||
propagateDirtyTransitively(packageMap, depsByPackage);
|
||||
assert.ok(!packageMap['pkg-a'].isDirty);
|
||||
assert.ok(!packageMap['pkg-b'].isDirty);
|
||||
});
|
||||
|
||||
it('propagates dirty state one level up the dependency chain', () => {
|
||||
const packageMap = {
|
||||
'pkg-a': { isDirty: false },
|
||||
'pkg-b': { isDirty: true },
|
||||
};
|
||||
const depsByPackage = {
|
||||
'pkg-a': { 'pkg-b': 'workspace:*' },
|
||||
'pkg-b': {},
|
||||
};
|
||||
propagateDirtyTransitively(packageMap, depsByPackage);
|
||||
assert.ok(packageMap['pkg-a'].isDirty);
|
||||
});
|
||||
|
||||
it('propagates dirty state through multiple levels', () => {
|
||||
const packageMap = {
|
||||
'pkg-a': { isDirty: false },
|
||||
'pkg-b': { isDirty: false },
|
||||
'pkg-c': { isDirty: true },
|
||||
};
|
||||
const depsByPackage = {
|
||||
'pkg-a': { 'pkg-b': 'workspace:*' },
|
||||
'pkg-b': { 'pkg-c': 'workspace:*' },
|
||||
'pkg-c': {},
|
||||
};
|
||||
propagateDirtyTransitively(packageMap, depsByPackage);
|
||||
assert.ok(packageMap['pkg-b'].isDirty, 'pkg-b should be dirty (depends on dirty pkg-c)');
|
||||
assert.ok(packageMap['pkg-a'].isDirty, 'pkg-a should be dirty (depends on dirty pkg-b)');
|
||||
});
|
||||
|
||||
it('does not mark packages dirty when their deps are external (not in packageMap)', () => {
|
||||
const packageMap = { 'pkg-a': { isDirty: false } };
|
||||
const depsByPackage = { 'pkg-a': { lodash: '^4' } };
|
||||
propagateDirtyTransitively(packageMap, depsByPackage);
|
||||
assert.ok(!packageMap['pkg-a'].isDirty);
|
||||
});
|
||||
|
||||
it('handles diamond dependency graphs without infinite loops', () => {
|
||||
// pkg-a depends on pkg-b and pkg-c; both depend on pkg-d (dirty)
|
||||
const packageMap = {
|
||||
'pkg-a': { isDirty: false },
|
||||
'pkg-b': { isDirty: false },
|
||||
'pkg-c': { isDirty: false },
|
||||
'pkg-d': { isDirty: true },
|
||||
};
|
||||
const depsByPackage = {
|
||||
'pkg-a': { 'pkg-b': 'workspace:*', 'pkg-c': 'workspace:*' },
|
||||
'pkg-b': { 'pkg-d': 'workspace:*' },
|
||||
'pkg-c': { 'pkg-d': 'workspace:*' },
|
||||
'pkg-d': {},
|
||||
};
|
||||
propagateDirtyTransitively(packageMap, depsByPackage);
|
||||
assert.ok(packageMap['pkg-b'].isDirty);
|
||||
assert.ok(packageMap['pkg-c'].isDirty);
|
||||
assert.ok(packageMap['pkg-a'].isDirty);
|
||||
});
|
||||
});
|
||||
|
||||
describe('computeNewVersion', () => {
|
||||
it('increments patch version', () => {
|
||||
assert.equal(computeNewVersion('1.2.3', 'patch'), '1.2.4');
|
||||
});
|
||||
|
||||
it('increments minor version (resets patch)', () => {
|
||||
assert.equal(computeNewVersion('1.2.3', 'minor'), '1.3.0');
|
||||
});
|
||||
|
||||
it('increments major version (resets minor and patch)', () => {
|
||||
assert.equal(computeNewVersion('1.2.3', 'major'), '2.0.0');
|
||||
});
|
||||
|
||||
it('creates -exp.0 from a stable version for experimental', () => {
|
||||
assert.equal(computeNewVersion('1.2.3', 'experimental'), '1.2.3-exp.0');
|
||||
});
|
||||
|
||||
it('increments exp minor for experimental when already an exp version', () => {
|
||||
assert.equal(computeNewVersion('1.2.3-exp.0', 'experimental'), '1.2.3-exp.1');
|
||||
});
|
||||
|
||||
it('creates a premajor rc version from a stable version', () => {
|
||||
assert.equal(computeNewVersion('1.2.3', 'premajor'), '2.0.0-rc.0');
|
||||
});
|
||||
|
||||
it('increments the rc prerelease number for premajor when already an rc version', () => {
|
||||
assert.equal(computeNewVersion('2.0.0-rc.0', 'premajor'), '2.0.0-rc.1');
|
||||
});
|
||||
|
||||
it('increments rc correctly across multiple premajor calls', () => {
|
||||
assert.equal(computeNewVersion('2.0.0-rc.4', 'premajor'), '2.0.0-rc.5');
|
||||
});
|
||||
});
|
||||
2
.github/scripts/quality/check-pr-size.mjs
vendored
2
.github/scripts/quality/check-pr-size.mjs
vendored
|
|
@ -40,6 +40,8 @@ export const EXCLUDE_PATTERNS = [
|
|||
'packages/testing/**',
|
||||
// Lock file (can produce massive diffs on dependency changes)
|
||||
'pnpm-lock.yaml',
|
||||
'**/*.md',
|
||||
'**/*.mdx'
|
||||
];
|
||||
|
||||
const BOT_MARKER = '<!-- pr-size-check -->';
|
||||
|
|
|
|||
|
|
@ -203,4 +203,13 @@ describe('countFilteredAdditions', () => {
|
|||
];
|
||||
assert.equal(countFilteredAdditions(files, EXCLUDE_PATTERNS), 50);
|
||||
});
|
||||
|
||||
it('applies EXCLUDE_PATTERNS to markdown files', () => {
|
||||
const files = [
|
||||
{ filename: 'packages/cli/src/service.ts', additions: 50 },
|
||||
{ filename: 'packages/cli/AGENTS.md', additions: 100 },
|
||||
{ filename: 'packages/frontend/STORIES.mdx', additions: 100 },
|
||||
];
|
||||
assert.equal(countFilteredAdditions(files, EXCLUDE_PATTERNS), 50);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
19
.github/workflows/ci-pr-quality.yml
vendored
19
.github/workflows/ci-pr-quality.yml
vendored
|
|
@ -1,6 +1,7 @@
|
|||
name: 'CI: PR Quality Checks'
|
||||
|
||||
on:
|
||||
merge_group:
|
||||
pull_request:
|
||||
types:
|
||||
- opened
|
||||
|
|
@ -99,3 +100,21 @@ jobs:
|
|||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: node .github/scripts/quality/check-pr-size.mjs
|
||||
|
||||
required-pr-quality-checks:
|
||||
name: Required PR Quality Checks
|
||||
needs: [check-ownership-checkbox, check-pr-size]
|
||||
if: always()
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 5
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
sparse-checkout: .github/actions/ci-filter
|
||||
sparse-checkout-cone-mode: false
|
||||
- name: Validate required checks
|
||||
uses: ./.github/actions/ci-filter
|
||||
with:
|
||||
mode: validate
|
||||
job-results: ${{ toJSON(needs) }}
|
||||
|
||||
|
|
|
|||
2
.github/workflows/release-publish.yml
vendored
2
.github/workflows/release-publish.yml
vendored
|
|
@ -107,7 +107,7 @@ jobs:
|
|||
|
||||
build-daytona-snapshot:
|
||||
name: Build Daytona snapshot
|
||||
needs: [determine-version-info]
|
||||
needs: [determine-version-info, publish-to-npm]
|
||||
if: github.event.pull_request.merged == true
|
||||
uses: ./.github/workflows/release-build-daytona-snapshot.yml
|
||||
with:
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ jobs:
|
|||
uses: ./.github/workflows/test-e2e-reusable.yml
|
||||
with:
|
||||
test-mode: docker-artifact
|
||||
test-command: pnpm --filter=n8n-playwright test:all --project='${{ matrix.profile }}:infrastructure' --workers=1
|
||||
test-command: pnpm --filter=n8n-playwright test:all --project=${{ matrix.profile }}:infrastructure --workers=1
|
||||
runner: ${{ matrix.runner }}
|
||||
timeout-minutes: 60
|
||||
secrets: inherit
|
||||
|
|
|
|||
116
CHANGELOG.md
116
CHANGELOG.md
|
|
@ -1,3 +1,119 @@
|
|||
# [2.20.0](https://github.com/n8n-io/n8n/compare/n8n@2.19.0...n8n@2.20.0) (2026-05-05)
|
||||
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
* **ai-builder:** Add boundaries on the workflow builder remediation loops ([#29430](https://github.com/n8n-io/n8n/issues/29430)) ([2259f32](https://github.com/n8n-io/n8n/commit/2259f32de88c103b088b450bf46990ad2e939942))
|
||||
* **ai-builder:** Allow skipping final ask-user question ([#29563](https://github.com/n8n-io/n8n/issues/29563)) ([661f990](https://github.com/n8n-io/n8n/commit/661f9908bce51076811c76c854f165f4c5acaccf))
|
||||
* **ai-builder:** Filter LangSmith eval dataset by local file slugs ([#29507](https://github.com/n8n-io/n8n/issues/29507)) ([54d9286](https://github.com/n8n-io/n8n/commit/54d9286d922e0cad17d5c5de10a052d653c1591b))
|
||||
* **ai-builder:** Handle properties with contradicting displayOptions as OR alternatives instead of AND ([#29500](https://github.com/n8n-io/n8n/issues/29500)) ([84ac811](https://github.com/n8n-io/n8n/commit/84ac8110f8d70dd653b4d40cb63259522731b0d0))
|
||||
* **ai-builder:** Stop builder from adding auth to inbound trigger nodes by default ([#29648](https://github.com/n8n-io/n8n/issues/29648)) ([c28d501](https://github.com/n8n-io/n8n/commit/c28d501ba1630861fa0993d0d85f08efb635a5a4))
|
||||
* Allow 5-field cron expressions with step values in polling nodes ([#29447](https://github.com/n8n-io/n8n/issues/29447)) ([d18f183](https://github.com/n8n-io/n8n/commit/d18f183b211416d5b74cfdc2e740b9c663ede134))
|
||||
* **Anthropic Chat Model Node:** Add adaptive thinking mode for Claude Opus 4.7+ ([#29467](https://github.com/n8n-io/n8n/issues/29467)) ([90d875c](https://github.com/n8n-io/n8n/commit/90d875ce3e5a2a004a5a3d8f28ac4e9820b109f4))
|
||||
* **Compare Datasets Node:** Preserve falsy values in mix mode except fields ([#29666](https://github.com/n8n-io/n8n/issues/29666)) ([62ddc5c](https://github.com/n8n-io/n8n/commit/62ddc5c443273559c286a1d2eb19efdca345ac9a))
|
||||
* **core:** Accept placeholder() inside node credentials slot ([#29691](https://github.com/n8n-io/n8n/issues/29691)) ([dc6bd68](https://github.com/n8n-io/n8n/commit/dc6bd68de3b419fb1e23806781bbc125b621ed8a))
|
||||
* **core:** Acquire expression isolate for dynamic node parameter requests ([#29671](https://github.com/n8n-io/n8n/issues/29671)) ([418f1f2](https://github.com/n8n-io/n8n/commit/418f1f2edb6abfebe1085b8c3b5c1b22530f1a5c))
|
||||
* **core:** Add file path validation to localFile source ([#29464](https://github.com/n8n-io/n8n/issues/29464)) ([7277566](https://github.com/n8n-io/n8n/commit/7277566c64c36f5e43c17a2e620da2408ab1dcb7))
|
||||
* **core:** Add GET handler to MCP endpoint for Streamable HTTP spec compliance ([#28787](https://github.com/n8n-io/n8n/issues/28787)) ([4ae0322](https://github.com/n8n-io/n8n/commit/4ae0322ef246348892000d0539904e56c122d204))
|
||||
* **core:** Add timeout to external secrets provider refresh ([#29679](https://github.com/n8n-io/n8n/issues/29679)) ([e350429](https://github.com/n8n-io/n8n/commit/e35042999f7d477ed1da59f43ef03605763ac2bf))
|
||||
* **core:** Apply credential allowed domains in declarative node requests ([#29082](https://github.com/n8n-io/n8n/issues/29082)) ([8551b1b](https://github.com/n8n-io/n8n/commit/8551b1b90ce16b31a017bd07177694ef39ad226d))
|
||||
* **core:** Correct LDAP search filter construction ([#29388](https://github.com/n8n-io/n8n/issues/29388)) ([32dd743](https://github.com/n8n-io/n8n/commit/32dd7433b7ef168161e32c20939859060da9827c))
|
||||
* **core:** Fix code node executions hanging when idle timer overlaps with task acceptance ([#29239](https://github.com/n8n-io/n8n/issues/29239)) ([7bd3532](https://github.com/n8n-io/n8n/commit/7bd3532f07c151568634e84f3ae24f38ab8e60e4))
|
||||
* **core:** Fix MCP OAuth discovery URL construction and grant type selection ([#27283](https://github.com/n8n-io/n8n/issues/27283)) ([d92ec16](https://github.com/n8n-io/n8n/commit/d92ec168aa5f984513874e2978f73d8f2cbdc80e))
|
||||
* **core:** Force saving executions when instance AI executes WFs ([#29515](https://github.com/n8n-io/n8n/issues/29515)) ([ef56501](https://github.com/n8n-io/n8n/commit/ef56501d4729b5b508a4c5e60263d10a8fc9db76))
|
||||
* **core:** Gate Instance AI edits to pre-existing workflows ([#29501](https://github.com/n8n-io/n8n/issues/29501)) ([6175fd6](https://github.com/n8n-io/n8n/commit/6175fd6f7b56ead0176938657085b763c1204681))
|
||||
* **core:** Generate array types for properties with multipleValues ([#29410](https://github.com/n8n-io/n8n/issues/29410)) ([fb65c61](https://github.com/n8n-io/n8n/commit/fb65c6155ee9ae5b11a2c409f35e98c206aaf164))
|
||||
* **core:** Handle missing runData during execution recovery ([#29513](https://github.com/n8n-io/n8n/issues/29513)) ([8b7b4f5](https://github.com/n8n-io/n8n/commit/8b7b4f575d9d9b5b02a8ddf67aaff6b3d5279d78))
|
||||
* **core:** Harden Set node workflow SDK contract ([#29568](https://github.com/n8n-io/n8n/issues/29568)) ([625ed5e](https://github.com/n8n-io/n8n/commit/625ed5e95a90f30e07e88253515713056e406f5b))
|
||||
* **core:** Include stack trace in error logs for non-ApplicationError errors ([#29496](https://github.com/n8n-io/n8n/issues/29496)) ([16d1461](https://github.com/n8n-io/n8n/commit/16d1461858107697eac399039c834c7296fe8868))
|
||||
* **core:** Increase default task runner grant token TTL to 30s ([#29443](https://github.com/n8n-io/n8n/issues/29443)) ([328f4b8](https://github.com/n8n-io/n8n/commit/328f4b8b964d587763bf14b1980916046878f0f0))
|
||||
* **core:** Isolate expressions on chat resumption and test webhook deactivation ([#29703](https://github.com/n8n-io/n8n/issues/29703)) ([568e5a2](https://github.com/n8n-io/n8n/commit/568e5a24bf8f4e73d0b134dbac1631535bba10a7))
|
||||
* **core:** Make MCP client registration cap tunable and surface a proper limit error ([#29429](https://github.com/n8n-io/n8n/issues/29429)) ([dad4231](https://github.com/n8n-io/n8n/commit/dad423155f1ee105e3ed1eab0b65a8d8bc2ee3a3))
|
||||
* **core:** Make task runner grant token TTL configurable ([#29357](https://github.com/n8n-io/n8n/issues/29357)) ([3f350a8](https://github.com/n8n-io/n8n/commit/3f350a85770680895be5723803ef51453476fed2))
|
||||
* **core:** Pass nodeTypesProvider to validate workflows fully at instance AI ([#29333](https://github.com/n8n-io/n8n/issues/29333)) ([388cd79](https://github.com/n8n-io/n8n/commit/388cd79908418d558fff36f938969cdc79fc60c2))
|
||||
* **core:** Persist execution context before writing to db ([#28973](https://github.com/n8n-io/n8n/issues/28973)) ([c4bb5ae](https://github.com/n8n-io/n8n/commit/c4bb5ae8df8e7de4c7b919a82d3cf2f492edcc5b))
|
||||
* **core:** Recreate data table backing tables on entity import ([#29454](https://github.com/n8n-io/n8n/issues/29454)) ([6bca1fa](https://github.com/n8n-io/n8n/commit/6bca1fa26f0d1a23c8c7e175dc6ae590eeb2036e))
|
||||
* **core:** Reject empty webhookMethods in community lint rule ([#29474](https://github.com/n8n-io/n8n/issues/29474)) ([34d7a02](https://github.com/n8n-io/n8n/commit/34d7a02df73f233ef55fc78e3ea8167bc2b32a1f))
|
||||
* **core:** Reset Redis retry counter on successful reconnect ([#29377](https://github.com/n8n-io/n8n/issues/29377)) ([7722023](https://github.com/n8n-io/n8n/commit/7722023abd8ffb2f96a7dbec0ba51e4d7454ea05))
|
||||
* **core:** Respect global admin scope when listing favorites ([#29472](https://github.com/n8n-io/n8n/issues/29472)) ([d9d1e7c](https://github.com/n8n-io/n8n/commit/d9d1e7c44a1bcf074cdbec234b0d8d4ddb8d7d5e))
|
||||
* **core:** Restore peer project discovery in share dropdowns ([#29537](https://github.com/n8n-io/n8n/issues/29537)) ([2a0e2fb](https://github.com/n8n-io/n8n/commit/2a0e2fb47ae1d82cd2354db8c2013ea46f24f21e))
|
||||
* **core:** Round fractional time saved values before inserting into insights BIGINT column ([#29553](https://github.com/n8n-io/n8n/issues/29553)) ([74d55b9](https://github.com/n8n-io/n8n/commit/74d55b9c681273ae79fbaf39693bd3b37d83b66a))
|
||||
* **core:** Show AI Builder draft workflows in workflow list ([#29670](https://github.com/n8n-io/n8n/issues/29670)) ([dc52bbd](https://github.com/n8n-io/n8n/commit/dc52bbd5329a27245a5fe2a1da45d9e8efe6a549))
|
||||
* **core:** Use editor base URL for workflow and execution links ([#23630](https://github.com/n8n-io/n8n/issues/23630)) ([896461b](https://github.com/n8n-io/n8n/commit/896461bee3c356e66b282763cd31427a137ebd62))
|
||||
* **core:** Validate workflow import URL requests ([#29178](https://github.com/n8n-io/n8n/issues/29178)) ([ecd0ba8](https://github.com/n8n-io/n8n/commit/ecd0ba8ebabc99055441290d543f0bd87a33df31))
|
||||
* **core:** Wire EncryptionKeyProxy provider on bootstrap ([#29581](https://github.com/n8n-io/n8n/issues/29581)) ([ee7260c](https://github.com/n8n-io/n8n/commit/ee7260c4959b0dff8636606aebdac10eddd76e36))
|
||||
* **DeepL Node:** Update credentials to use header-based authentication ([#24614](https://github.com/n8n-io/n8n/issues/24614)) ([b72bd19](https://github.com/n8n-io/n8n/commit/b72bd1987c33b15cd658d2a038b9763c6fb83b55))
|
||||
* Drop template search tools from builder ([#29573](https://github.com/n8n-io/n8n/issues/29573)) ([9b00ccb](https://github.com/n8n-io/n8n/commit/9b00ccbfd1cfb123533397126123f5d2ad34071f))
|
||||
* **editor:** Add proper bg color for hover state with color-mix() ([#29590](https://github.com/n8n-io/n8n/issues/29590)) ([6698c42](https://github.com/n8n-io/n8n/commit/6698c42e4ed4706825f5d2e3bac39641e261f153))
|
||||
* **editor:** Align message box button radius with N8nButton ([#29397](https://github.com/n8n-io/n8n/issues/29397)) ([bc315d0](https://github.com/n8n-io/n8n/commit/bc315d087fd772218b2f3caa047c86493c048f27))
|
||||
* **editor:** Fix OAuth2 credential showing "Needs first setup" after connecting ([#29617](https://github.com/n8n-io/n8n/issues/29617)) ([243f665](https://github.com/n8n-io/n8n/commit/243f665e60bff1c2531977c3f860aa7589a321e9))
|
||||
* **editor:** Fix sub-workflow folder placement and connection loss ([#28770](https://github.com/n8n-io/n8n/issues/28770)) ([44579d6](https://github.com/n8n-io/n8n/commit/44579d6d3ae59a1f4eedf9a0b49cecb006053072))
|
||||
* **editor:** Ignore paste events on read-only canvas ([#29673](https://github.com/n8n-io/n8n/issues/29673)) ([34c49b9](https://github.com/n8n-io/n8n/commit/34c49b9c238de5d5ee0b9421918435c4582eb13a))
|
||||
* **editor:** Keep publish actions menu enabled for published workflows ([#29396](https://github.com/n8n-io/n8n/issues/29396)) ([c65fa28](https://github.com/n8n-io/n8n/commit/c65fa28e1caac5a49e6a5e82d3354ed631be0df4))
|
||||
* **editor:** Load more executions on tall screens ([#29407](https://github.com/n8n-io/n8n/issues/29407)) ([a273a9d](https://github.com/n8n-io/n8n/commit/a273a9d3f498d8112605f1277ce7848d8bd357c3))
|
||||
* **editor:** Make instance ai resource link chips open resources ([#29577](https://github.com/n8n-io/n8n/issues/29577)) ([b97ca36](https://github.com/n8n-io/n8n/commit/b97ca36a99d099288cfc127df98038b2b64c03d5))
|
||||
* **editor:** Make textarea resize handle accessible in NDV ([#29676](https://github.com/n8n-io/n8n/issues/29676)) ([9fda733](https://github.com/n8n-io/n8n/commit/9fda7332c4c0a8851a7482365a967ea18db2a816))
|
||||
* **editor:** Mark workflow dirty after debug pinData changes ([#28886](https://github.com/n8n-io/n8n/issues/28886)) ([2beb006](https://github.com/n8n-io/n8n/commit/2beb0062a5f92c883f18abaf9ea33590a41aca49))
|
||||
* **editor:** Never block publishing on node execution issues ([#29479](https://github.com/n8n-io/n8n/issues/29479)) ([5a56459](https://github.com/n8n-io/n8n/commit/5a564591291989f13ac667eed575332f7f4d2a6a))
|
||||
* **editor:** Polish encryption keys date range filter ([#29569](https://github.com/n8n-io/n8n/issues/29569)) ([56412bc](https://github.com/n8n-io/n8n/commit/56412bcce2ef1d364acdbe422f5c88762319bb22))
|
||||
* **editor:** Remove clipping for focus panel textarea ([#28677](https://github.com/n8n-io/n8n/issues/28677)) ([5361257](https://github.com/n8n-io/n8n/commit/5361257a80e515e1cc26cdf10e8ceb78c9ec70be))
|
||||
* **editor:** Restore read-only mode for archived workflows on canvas ([#29559](https://github.com/n8n-io/n8n/issues/29559)) ([a7ef741](https://github.com/n8n-io/n8n/commit/a7ef7416b111384d250f975e718c691b2674fef6))
|
||||
* **editor:** Show permission-aware message on redacted input/output panels ([#29521](https://github.com/n8n-io/n8n/issues/29521)) ([83c400e](https://github.com/n8n-io/n8n/commit/83c400e8d47c875f57dce26680358595822ce012))
|
||||
* **editor:** Surface unofficial verified community node tools in AI Tools picker ([#28985](https://github.com/n8n-io/n8n/issues/28985)) ([f77dfd1](https://github.com/n8n-io/n8n/commit/f77dfd1a11591124e6db61c72ed207067bae6214))
|
||||
* Fix ollama node url path and thinking tokens ([#23963](https://github.com/n8n-io/n8n/issues/23963)) ([4ea1153](https://github.com/n8n-io/n8n/commit/4ea1153dfb903346bead9e6d328ec8f543c80559))
|
||||
* **Google Drive Node:** Resolve original file name when copying with empty name ([#28896](https://github.com/n8n-io/n8n/issues/28896)) ([c274976](https://github.com/n8n-io/n8n/commit/c2749768aa5d173c3354e8d31a18c438ebd5fdfb))
|
||||
* **Merge Node:** Improve SQL Query mode memory efficiency and error reporting ([#28993](https://github.com/n8n-io/n8n/issues/28993)) ([12275c8](https://github.com/n8n-io/n8n/commit/12275c86d992115fef2ded4e5f172730222c5669))
|
||||
* **Microsoft Outlook Trigger Node:** Use per-folder endpoints for folder-scoped message polling ([#29663](https://github.com/n8n-io/n8n/issues/29663)) ([f401f91](https://github.com/n8n-io/n8n/commit/f401f9101d08fc62eef7e051f3baa23638c80c1b))
|
||||
* No Credits state for n8n Connect badge ([#29375](https://github.com/n8n-io/n8n/issues/29375)) ([47ad397](https://github.com/n8n-io/n8n/commit/47ad39777f9525324524f2595fc4506065f33a9c))
|
||||
* **Notion Node:** Support app.notion.com URL format for page and block ID extraction ([#29554](https://github.com/n8n-io/n8n/issues/29554)) ([221c7f7](https://github.com/n8n-io/n8n/commit/221c7f7410d25b89b052e89d745184675b69dc53))
|
||||
* **Postgres Node:** Output Large-Format Numbers As option ignored after pool is cached ([#29477](https://github.com/n8n-io/n8n/issues/29477)) ([a65e181](https://github.com/n8n-io/n8n/commit/a65e181a2213f1b984c225539302a1a12a30cc9b))
|
||||
* **Salesforce Node:** Allow overriding JWT audience with My Domain URL ([#29016](https://github.com/n8n-io/n8n/issues/29016)) ([9decb1e](https://github.com/n8n-io/n8n/commit/9decb1e2a9f6d6612014354d7ca6f8b62600ce9d))
|
||||
* **Schedule Node:** Cap day-of-month jitter at 28 ([#29614](https://github.com/n8n-io/n8n/issues/29614)) ([86f47ee](https://github.com/n8n-io/n8n/commit/86f47ee6dc88397b05bfb784b0092674ba3b4289))
|
||||
* Skip AI tool generation for community trigger nodes ([#29453](https://github.com/n8n-io/n8n/issues/29453)) ([c724dac](https://github.com/n8n-io/n8n/commit/c724dace38ec1e3aa69de40d48e068cf36c962b0))
|
||||
* **Snowflake Node:** Avoid call stack overflow on large result sets ([#29200](https://github.com/n8n-io/n8n/issues/29200)) ([b2ac67f](https://github.com/n8n-io/n8n/commit/b2ac67f15452c625d4dee146a040b6324cdfefbb))
|
||||
* **Telegram Trigger Node:** Drop pending updates when creating a new webhook ([#29103](https://github.com/n8n-io/n8n/issues/29103)) ([4358f1d](https://github.com/n8n-io/n8n/commit/4358f1d51c588e76d03aa677f9b7deabbbc1af9d))
|
||||
* **Todoist Node:** Migrate to Todoist unified API v1 endpoints ([#29532](https://github.com/n8n-io/n8n/issues/29532)) ([5799481](https://github.com/n8n-io/n8n/commit/5799481d1c3bf14806d11ba2928af4f7f88db29f))
|
||||
* Use explicit node references for AI memory session keys ([#29473](https://github.com/n8n-io/n8n/issues/29473)) ([139b803](https://github.com/n8n-io/n8n/commit/139b803daefca44fd66a92156867d77ccdffcc66))
|
||||
* Validate sql ([#24706](https://github.com/n8n-io/n8n/issues/24706)) ([47a6658](https://github.com/n8n-io/n8n/commit/47a6658b2d4cd2d4be5e59b0d61f9bd25b553007))
|
||||
* **Zammad Node:** Add To and CC fields for email articles ([#28860](https://github.com/n8n-io/n8n/issues/28860)) ([e04f027](https://github.com/n8n-io/n8n/commit/e04f027b5dd008eb0c9354d166c716a93cdc48b7))
|
||||
|
||||
|
||||
### Features
|
||||
|
||||
* Add instance-level JWKS URI endpoint for JWE public key distribution ([#29498](https://github.com/n8n-io/n8n/issues/29498)) ([794334c](https://github.com/n8n-io/n8n/commit/794334cd79f1ee5a05cd0d818fc801920e0fe6d9))
|
||||
* Add no-runtime-dependencies ESLint rule ([#29366](https://github.com/n8n-io/n8n/issues/29366)) ([8aace75](https://github.com/n8n-io/n8n/commit/8aace75535f53ebf37c2a547849e044948c99cb8))
|
||||
* Add pairwise workflow eval pipeline ([#29123](https://github.com/n8n-io/n8n/issues/29123)) ([fdceec2](https://github.com/n8n-io/n8n/commit/fdceec21b996a1456ceb44389e760a80d75d49a1))
|
||||
* Add valid-credential-references ESLint rule ([#29452](https://github.com/n8n-io/n8n/issues/29452)) ([c6c6f8f](https://github.com/n8n-io/n8n/commit/c6c6f8ff3889a48ac73d5e5bb242e88818707fc0))
|
||||
* **core:** Add --include and --exclude flags to import:credentials command ([#29364](https://github.com/n8n-io/n8n/issues/29364)) ([f5132b9](https://github.com/n8n-io/n8n/commit/f5132b9e9abe23eb1a2b1225d889f1dd83d83f94))
|
||||
* **core:** Add configurable event log path per process ([#29403](https://github.com/n8n-io/n8n/issues/29403)) ([45effb8](https://github.com/n8n-io/n8n/commit/45effb8959e4013d46a022a5a3f901e9d0284d35))
|
||||
* **core:** Add endpoint to toggle mcp access for multiple workflows ([#29007](https://github.com/n8n-io/n8n/issues/29007)) ([0d907d6](https://github.com/n8n-io/n8n/commit/0d907d67945dfd9624eda6f3fb634cee4bd2d195))
|
||||
* **core:** Add JWE decryption to OAuth2 credential flow ([#29497](https://github.com/n8n-io/n8n/issues/29497)) ([ad7cdcc](https://github.com/n8n-io/n8n/commit/ad7cdcc04f47e1c34754636098ff698b7b153d05))
|
||||
* **core:** Add MCP tool search executions ([#29161](https://github.com/n8n-io/n8n/issues/29161)) ([1d9548c](https://github.com/n8n-io/n8n/commit/1d9548c81f6a984882aadd7091cd649967aa7201))
|
||||
* **core:** Add migration for postgres variable values ([#29489](https://github.com/n8n-io/n8n/issues/29489)) ([898ba5a](https://github.com/n8n-io/n8n/commit/898ba5ae2562542af11031b5dfdf0400afb91fbd))
|
||||
* **core:** Add preAuthentication support to requestOAuth2 pipeline ([#29418](https://github.com/n8n-io/n8n/issues/29418)) ([473d49c](https://github.com/n8n-io/n8n/commit/473d49c9b18ff4d8226f54fe0c5c8a2a1c6fdca5))
|
||||
* **core:** Bootstrap legacy CBC and initial GCM encryption keys on startup ([#29400](https://github.com/n8n-io/n8n/issues/29400)) ([9576ab9](https://github.com/n8n-io/n8n/commit/9576ab907cc3bdb560d1b40a1582ecf67c253d3a))
|
||||
* **core:** Broadcast workflow settings updates ([#29459](https://github.com/n8n-io/n8n/issues/29459)) ([9cb1605](https://github.com/n8n-io/n8n/commit/9cb160585c05ccb1770554cd0998ea4d9b0ab3cc))
|
||||
* **core:** Decouple insights pruning max age from license ([#29527](https://github.com/n8n-io/n8n/issues/29527)) ([45c18fb](https://github.com/n8n-io/n8n/commit/45c18fb09c04749063edc3545c38ad37006c0c49))
|
||||
* **core:** Fix user access control logic ([#29481](https://github.com/n8n-io/n8n/issues/29481)) ([484cb2e](https://github.com/n8n-io/n8n/commit/484cb2efba8b33555c4d34bb95680d16a3328c1e))
|
||||
* **core:** Manage MCP settings via environment variables ([#29368](https://github.com/n8n-io/n8n/issues/29368)) ([05e10e2](https://github.com/n8n-io/n8n/commit/05e10e268083fd7f9f1176634f0c1cab88297b94))
|
||||
* **core:** Run evaluation test cases in parallel behind PostHog rollout flag ([#29412](https://github.com/n8n-io/n8n/issues/29412)) ([4c76aa1](https://github.com/n8n-io/n8n/commit/4c76aa1467d08d5f188cf8b7716b52b410f2bd65))
|
||||
* **core:** Use versioned prebuilt Daytona snapshots for Instance AI sandboxes ([#29359](https://github.com/n8n-io/n8n/issues/29359)) ([308d0b4](https://github.com/n8n-io/n8n/commit/308d0b42b32a3372bac3a759b15ee410c9d095eb))
|
||||
* **core:** Warn and skip on duplicate scheduled executions ([#28649](https://github.com/n8n-io/n8n/issues/28649)) ([b8b7571](https://github.com/n8n-io/n8n/commit/b8b75719ba373a27f60c6f471b170216fe7c41a9))
|
||||
* **editor:** Add data encryption keys settings page ([#29068](https://github.com/n8n-io/n8n/issues/29068)) ([656f9c2](https://github.com/n8n-io/n8n/commit/656f9c2d7fc635c117efaeb40bb0fb98256f5ba3))
|
||||
* **editor:** Add environment variable to disable workflow autosave ([#25144](https://github.com/n8n-io/n8n/issues/25144)) ([a2afc47](https://github.com/n8n-io/n8n/commit/a2afc47c226a716b7ae059306e684748c9d65947))
|
||||
* **editor:** Add reveal redacted data permission to custom roles execution section ([#29526](https://github.com/n8n-io/n8n/issues/29526)) ([be22095](https://github.com/n8n-io/n8n/commit/be22095646c0daf2bbdc2afb7ebc4c1e4a50e349))
|
||||
* **editor:** Add transition on Sidebar collapsed ([#29650](https://github.com/n8n-io/n8n/issues/29650)) ([07b5343](https://github.com/n8n-io/n8n/commit/07b53430f9e9efefaa78d90d3a613d5518ede4e5))
|
||||
* **editor:** Hide model selector for unsupported AI Gateway actions ([#29588](https://github.com/n8n-io/n8n/issues/29588)) ([0f7776e](https://github.com/n8n-io/n8n/commit/0f7776e972c1d94d0f61d6d8855865802ef2a273))
|
||||
* **editor:** Move Switch component to core design system ([#27322](https://github.com/n8n-io/n8n/issues/27322)) ([758f89c](https://github.com/n8n-io/n8n/commit/758f89c9ef4b936e1904c244698ccb4d92f6dd51))
|
||||
* **editor:** Track IdP role mapping in provisioning telemetry ([#29416](https://github.com/n8n-io/n8n/issues/29416)) ([40da23f](https://github.com/n8n-io/n8n/commit/40da23f68899bc11240b252d417aa01dec8485a9))
|
||||
* **editor:** Update copy for mcp settings ([#29399](https://github.com/n8n-io/n8n/issues/29399)) ([5f93b48](https://github.com/n8n-io/n8n/commit/5f93b48e79067251e782940489848f81f897d3a4))
|
||||
* Include updatedAt in encryption key response DTO ([#29424](https://github.com/n8n-io/n8n/issues/29424)) ([569f94b](https://github.com/n8n-io/n8n/commit/569f94bb828bdd662bb291bd1d566e4e2a8ebdae))
|
||||
* **instance-ai:** Orchestrator-executed checkpoint tasks for planned workflow verification ([#29049](https://github.com/n8n-io/n8n/issues/29049)) ([ad359b5](https://github.com/n8n-io/n8n/commit/ad359b5e2ceaaf2ba04559e43117d81bc5f2df25))
|
||||
* **Netlify Trigger Node:** Add webhook request verification ([#29256](https://github.com/n8n-io/n8n/issues/29256)) ([1516ec7](https://github.com/n8n-io/n8n/commit/1516ec7c06ab797dbf94fd1b8a0322209e6ee0bc))
|
||||
* **Slack Node:** Allow users to configure OAuth2 scopes ([#28728](https://github.com/n8n-io/n8n/issues/28728)) ([aa0daf9](https://github.com/n8n-io/n8n/commit/aa0daf9fb630661d35e8bd006ed3b749051f7a7d))
|
||||
* Validate workflow-sdk output topology against mode ([#29363](https://github.com/n8n-io/n8n/issues/29363)) ([0a80722](https://github.com/n8n-io/n8n/commit/0a80722dcb3fcdbc23d9e768413b3141ec329adc))
|
||||
|
||||
|
||||
# [2.19.0](https://github.com/n8n-io/n8n/compare/n8n@2.18.0...n8n@2.19.0) (2026-04-28)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "n8n-monorepo",
|
||||
"version": "2.19.0",
|
||||
"version": "2.20.0",
|
||||
"private": true,
|
||||
"engines": {
|
||||
"node": ">=22.16",
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@n8n/ai-node-sdk",
|
||||
"version": "0.10.0",
|
||||
"version": "0.11.0",
|
||||
"description": "SDK for building AI nodes in n8n",
|
||||
"types": "dist/esm/index.d.ts",
|
||||
"module": "dist/esm/index.js",
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@n8n/ai-utilities",
|
||||
"version": "0.13.0",
|
||||
"version": "0.14.0",
|
||||
"description": "Utilities for building AI nodes in n8n",
|
||||
"types": "dist/esm/index.d.ts",
|
||||
"module": "dist/esm/index.js",
|
||||
|
|
|
|||
|
|
@ -11,6 +11,12 @@ import type {
|
|||
Feedback,
|
||||
} from '../../harness/harness-types';
|
||||
|
||||
// Re-exports so downstream consumers (e.g. instance-ai evals) can pull the
|
||||
// evaluator and its input/output types from a single module surface.
|
||||
export type { SimpleWorkflow };
|
||||
export type { EvaluationContext, Evaluator, Feedback };
|
||||
export { PAIRWISE_METRICS };
|
||||
|
||||
/**
|
||||
* Options for creating a pairwise evaluator.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@n8n/ai-workflow-builder",
|
||||
"version": "1.19.0",
|
||||
"version": "1.20.0",
|
||||
"scripts": {
|
||||
"clean": "rimraf dist .turbo",
|
||||
"typecheck": "tsc --noEmit",
|
||||
|
|
|
|||
|
|
@ -99,15 +99,16 @@ startTrigger.to(sourceA.to(sourceB.to(processResults)));
|
|||
// Pairs items by index, merging fields from both inputs into one item.
|
||||
// @example input0: [{ a: 1 }, { a: 2 }] input1: [{ b: 10, c: 'x' }, { b: 20 }]
|
||||
// output: [{ a: 1, b: 10, c: 'x' }, { a: 2, b: 20, c: undefined }]
|
||||
// .input(n) is 0-based: .input(0) = first input, .input(1) = second input.
|
||||
const combineResults = merge({
|
||||
version: 3.2,
|
||||
config: { name: 'Combine Results', parameters: { mode: 'combine', combineBy: 'combineByPosition' } }
|
||||
});
|
||||
export default workflow('id', 'name')
|
||||
.add(startTrigger)
|
||||
.to(sourceA.to(combineResults.input(0)))
|
||||
.to(sourceA.to(combineResults.input(0))) // first input (index 0)
|
||||
.add(startTrigger)
|
||||
.to(sourceB.to(combineResults.input(1)))
|
||||
.to(sourceB.to(combineResults.input(1))) // second input (index 1)
|
||||
.add(combineResults)
|
||||
.to(processResults);
|
||||
|
||||
|
|
@ -121,9 +122,9 @@ const allResults = merge({
|
|||
});
|
||||
export default workflow('id', 'name')
|
||||
.add(startTrigger)
|
||||
.to(sourceA.to(allResults.input(0)))
|
||||
.to(sourceA.to(allResults.input(0))) // first input (index 0)
|
||||
.add(startTrigger)
|
||||
.to(sourceB.to(allResults.input(1)))
|
||||
.to(sourceB.to(allResults.input(1))) // second input (index 1)
|
||||
.add(allResults)
|
||||
.to(processResults);
|
||||
\`\`\`
|
||||
|
|
@ -180,12 +181,13 @@ const branch1 = node({ type: 'n8n-nodes-base.httpRequest', ... });
|
|||
const branch2 = node({ type: 'n8n-nodes-base.httpRequest', ... });
|
||||
const processResults = node({ type: 'n8n-nodes-base.set', ... });
|
||||
|
||||
// Connect branches to specific merge inputs using .input(n)
|
||||
// Connect branches to specific merge inputs using .input(n).
|
||||
// Indices are 0-based: .input(0) is the FIRST input, .input(1) is the SECOND.
|
||||
export default workflow('id', 'name')
|
||||
.add(trigger({ ... }))
|
||||
.to(branch1.to(combineResults.input(0))) // Connect to input 0
|
||||
.to(branch1.to(combineResults.input(0))) // first input (index 0)
|
||||
.add(trigger({ ... }))
|
||||
.to(branch2.to(combineResults.input(1))) // Connect to input 1
|
||||
.to(branch2.to(combineResults.input(1))) // second input (index 1)
|
||||
.add(combineResults)
|
||||
.to(processResults); // Process merged results
|
||||
\`\`\`
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@n8n/api-types",
|
||||
"version": "1.19.0",
|
||||
"version": "1.20.0",
|
||||
"scripts": {
|
||||
"clean": "rimraf dist .turbo",
|
||||
"dev": "pnpm watch",
|
||||
|
|
|
|||
|
|
@ -14,4 +14,5 @@ export class AiGatewayConfigDto extends Z.class({
|
|||
nodes: z.array(z.string()),
|
||||
credentialTypes: z.array(z.string()),
|
||||
providerConfig: z.record(z.object(aiGatewayProviderConfigEntryShape)),
|
||||
supportedActions: z.record(z.record(z.array(z.string()))).optional(),
|
||||
}) {}
|
||||
|
|
|
|||
|
|
@ -25,7 +25,11 @@ export type {
|
|||
AiGatewayUsageResponse,
|
||||
} from './ai/ai-gateway-usage-response.dto';
|
||||
|
||||
export { InstanceAiConfirmRequestDto } from './instance-ai/instance-ai-confirm-request.dto';
|
||||
export {
|
||||
InstanceAiConfirmRequestDto,
|
||||
type InstanceAiConfirmRequest,
|
||||
type InstanceAiConfirmRequestKind,
|
||||
} from './instance-ai/instance-ai-confirm-request.dto';
|
||||
export { InstanceAiFeedbackRequestDto } from './instance-ai/instance-ai-feedback-request.dto';
|
||||
export { InstanceAiRenameThreadRequestDto } from './instance-ai/instance-ai-rename-thread-request.dto';
|
||||
|
||||
|
|
@ -198,8 +202,14 @@ export {
|
|||
OAuthClientResponseDto,
|
||||
ListOAuthClientsResponseDto,
|
||||
DeleteOAuthClientResponseDto,
|
||||
InstanceMcpClientStatsResponseDto,
|
||||
} from './oauth/oauth-client.dto';
|
||||
export { ProvisioningConfigDto, ProvisioningConfigPatchDto } from './provisioning/config.dto';
|
||||
export {
|
||||
ProvisioningConfigDto,
|
||||
ProvisioningConfigPatchDto,
|
||||
type ProvisioningMode,
|
||||
type ProvisioningModeFlags,
|
||||
} from './provisioning/config.dto';
|
||||
|
||||
export {
|
||||
SecuritySettingsDto,
|
||||
|
|
|
|||
|
|
@ -0,0 +1,151 @@
|
|||
import {
|
||||
InstanceAiConfirmRequestDto,
|
||||
type InstanceAiConfirmRequest,
|
||||
} from '../instance-ai-confirm-request.dto';
|
||||
|
||||
/**
|
||||
* The shapes in this file mirror what each frontend call site sends (see
|
||||
* components/ and composables/useSetupActions.ts in editor-ui). If a call site
|
||||
* changes the body shape, its branch here must change too — that is the whole
|
||||
* point of keeping this test.
|
||||
*/
|
||||
|
||||
describe('InstanceAiConfirmRequestDto', () => {
|
||||
describe('accepts each frontend-sent payload shape', () => {
|
||||
const cases: Array<[label: string, payload: InstanceAiConfirmRequest]> = [
|
||||
// InstanceAiConfirmationPanel: handleConfirm / handleApproveAll / handlePlanApprove / handleTextSkip
|
||||
['approval approve (no input)', { kind: 'approval', approved: true }],
|
||||
['approval deny (no input)', { kind: 'approval', approved: false }],
|
||||
// InstanceAiConfirmationPanel: handleTextSubmit
|
||||
[
|
||||
'approval with userInput (text submit)',
|
||||
{ kind: 'approval', approved: true, userInput: 'some typed answer' },
|
||||
],
|
||||
// InstanceAiConfirmationPanel: handlePlanRequestChanges + AgentTimeline feedback
|
||||
[
|
||||
'approval deny with userInput (plan feedback)',
|
||||
{ kind: 'approval', approved: false, userInput: 'please revise step 3' },
|
||||
],
|
||||
// InstanceAiConfirmationPanel: handleQuestionsSubmit
|
||||
[
|
||||
'questions with mixed answers',
|
||||
{
|
||||
kind: 'questions',
|
||||
answers: [
|
||||
{ questionId: 'q1', selectedOptions: ['opt-a'] },
|
||||
{ questionId: 'q2', selectedOptions: ['opt-b', 'opt-c'], customText: 'extra' },
|
||||
{ questionId: 'q3', selectedOptions: [], skipped: true },
|
||||
],
|
||||
},
|
||||
],
|
||||
// InstanceAiCredentialSetup: handleContinue
|
||||
[
|
||||
'credentialSelection with credential map',
|
||||
{
|
||||
kind: 'credentialSelection',
|
||||
credentials: { slackApi: 'cred-1', githubApi: 'cred-2' },
|
||||
},
|
||||
],
|
||||
// DomainAccessApproval: handleAction (primary path — with action)
|
||||
[
|
||||
'domainAccessApprove with allow_domain',
|
||||
{ kind: 'domainAccessApprove', domainAccessAction: 'allow_domain' },
|
||||
],
|
||||
[
|
||||
'domainAccessApprove with allow_once',
|
||||
{ kind: 'domainAccessApprove', domainAccessAction: 'allow_once' },
|
||||
],
|
||||
[
|
||||
'domainAccessApprove with allow_all',
|
||||
{ kind: 'domainAccessApprove', domainAccessAction: 'allow_all' },
|
||||
],
|
||||
// DomainAccessApproval: handleAction (deny path)
|
||||
['domainAccessDeny', { kind: 'domainAccessDeny' }],
|
||||
// confirmResourceDecision (store)
|
||||
[
|
||||
'resourceDecision with arbitrary decision token',
|
||||
{ kind: 'resourceDecision', resourceDecision: 'allowForSession' },
|
||||
],
|
||||
// useSetupActions: handleApply
|
||||
[
|
||||
'setupWorkflowApply (full payload)',
|
||||
{
|
||||
kind: 'setupWorkflowApply',
|
||||
nodeCredentials: {
|
||||
'Slack Node': { slackApi: 'cred-1' },
|
||||
'GitHub Node': { githubApi: 'cred-2' },
|
||||
},
|
||||
nodeParameters: {
|
||||
'Slack Node': { channel: '#general' },
|
||||
},
|
||||
},
|
||||
],
|
||||
['setupWorkflowApply (no node credentials)', { kind: 'setupWorkflowApply' }],
|
||||
// useSetupActions: handleTestTrigger
|
||||
[
|
||||
'setupWorkflowTestTrigger (with node credentials)',
|
||||
{
|
||||
kind: 'setupWorkflowTestTrigger',
|
||||
testTriggerNode: 'Webhook',
|
||||
nodeCredentials: { Webhook: { httpHeaderAuth: 'cred-3' } },
|
||||
nodeParameters: { Webhook: { path: '/trigger' } },
|
||||
},
|
||||
],
|
||||
[
|
||||
'setupWorkflowTestTrigger (minimal)',
|
||||
{ kind: 'setupWorkflowTestTrigger', testTriggerNode: 'Webhook' },
|
||||
],
|
||||
];
|
||||
|
||||
test.each(cases)('%s', (_label, payload) => {
|
||||
const result = InstanceAiConfirmRequestDto.safeParse(payload);
|
||||
expect(result.success).toBe(true);
|
||||
if (result.success) expect(result.data).toEqual(payload);
|
||||
});
|
||||
});
|
||||
|
||||
describe('rejects invalid payloads', () => {
|
||||
test('missing kind discriminator', () => {
|
||||
const result = InstanceAiConfirmRequestDto.safeParse({ approved: true });
|
||||
expect(result.success).toBe(false);
|
||||
});
|
||||
|
||||
test('unknown kind', () => {
|
||||
const result = InstanceAiConfirmRequestDto.safeParse({ kind: 'bogus', approved: true });
|
||||
expect(result.success).toBe(false);
|
||||
});
|
||||
|
||||
test('questions without answers array', () => {
|
||||
const result = InstanceAiConfirmRequestDto.safeParse({ kind: 'questions' });
|
||||
expect(result.success).toBe(false);
|
||||
});
|
||||
|
||||
test('credentialSelection without credentials map', () => {
|
||||
const result = InstanceAiConfirmRequestDto.safeParse({ kind: 'credentialSelection' });
|
||||
expect(result.success).toBe(false);
|
||||
});
|
||||
|
||||
test('resourceDecision without decision', () => {
|
||||
const result = InstanceAiConfirmRequestDto.safeParse({ kind: 'resourceDecision' });
|
||||
expect(result.success).toBe(false);
|
||||
});
|
||||
|
||||
test('setupWorkflowTestTrigger without testTriggerNode', () => {
|
||||
const result = InstanceAiConfirmRequestDto.safeParse({ kind: 'setupWorkflowTestTrigger' });
|
||||
expect(result.success).toBe(false);
|
||||
});
|
||||
|
||||
test('domainAccessAction must be a known value', () => {
|
||||
const result = InstanceAiConfirmRequestDto.safeParse({
|
||||
kind: 'domainAccessApprove',
|
||||
domainAccessAction: 'allow_never',
|
||||
});
|
||||
expect(result.success).toBe(false);
|
||||
});
|
||||
|
||||
test('domainAccessApprove without domainAccessAction', () => {
|
||||
const result = InstanceAiConfirmRequestDto.safeParse({ kind: 'domainAccessApprove' });
|
||||
expect(result.success).toBe(false);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -1,28 +1,92 @@
|
|||
import { z } from 'zod';
|
||||
|
||||
import { domainAccessActionSchema } from '../../schemas/instance-ai.schema';
|
||||
import { Z } from '../../zod-class';
|
||||
|
||||
export class InstanceAiConfirmRequestDto extends Z.class({
|
||||
/**
|
||||
* Plain approval/denial. Also carries optional `userInput` for:
|
||||
* - text-input confirmations (inputType='text')
|
||||
* - plan-review feedback accompanying approve/request-changes
|
||||
* - deferring/skipping credential or workflow setup wizards (`approved: false`)
|
||||
*/
|
||||
const approvalConfirmSchema = z.object({
|
||||
kind: z.literal('approval'),
|
||||
approved: z.boolean(),
|
||||
credentialId: z.string().optional(),
|
||||
credentials: z.record(z.string()).optional(),
|
||||
nodeCredentials: z.record(z.record(z.string())).optional(),
|
||||
autoSetup: z.object({ credentialType: z.string() }).optional(),
|
||||
userInput: z.string().optional(),
|
||||
domainAccessAction: domainAccessActionSchema.optional(),
|
||||
action: z.enum(['apply', 'test-trigger']).optional(),
|
||||
nodeParameters: z.record(z.record(z.unknown())).optional(),
|
||||
testTriggerNode: z.string().optional(),
|
||||
answers: z
|
||||
.array(
|
||||
z.object({
|
||||
questionId: z.string(),
|
||||
selectedOptions: z.array(z.string()),
|
||||
customText: z.string().optional(),
|
||||
skipped: z.boolean().optional(),
|
||||
}),
|
||||
)
|
||||
.optional(),
|
||||
resourceDecision: z.string().optional(),
|
||||
}) {}
|
||||
});
|
||||
|
||||
/** Q&A wizard submission (inputType='questions'). */
|
||||
const questionsConfirmSchema = z.object({
|
||||
kind: z.literal('questions'),
|
||||
answers: z.array(
|
||||
z.object({
|
||||
questionId: z.string(),
|
||||
selectedOptions: z.array(z.string()),
|
||||
customText: z.string().optional(),
|
||||
skipped: z.boolean().optional(),
|
||||
}),
|
||||
),
|
||||
});
|
||||
|
||||
/** Map of credential type → credential ID (e.g. `{ slackApi: 'cred-1', githubApi: 'cred-2' }`). */
|
||||
const credentialIdByTypeSchema = z.record(z.string());
|
||||
|
||||
const credentialSelectionConfirmSchema = z.object({
|
||||
kind: z.literal('credentialSelection'),
|
||||
credentials: credentialIdByTypeSchema,
|
||||
});
|
||||
|
||||
/** Domain-access approval — `domainAccessAction` carries which scope the user picked. */
|
||||
const domainAccessApproveSchema = z.object({
|
||||
kind: z.literal('domainAccessApprove'),
|
||||
domainAccessAction: domainAccessActionSchema,
|
||||
});
|
||||
|
||||
/** Domain-access denial — no further input. */
|
||||
const domainAccessDenySchema = z.object({
|
||||
kind: z.literal('domainAccessDeny'),
|
||||
});
|
||||
|
||||
/** Gateway resource-access decision (inputType='resource-decision'). Approval is implied.
|
||||
* `resourceDecision` is one of the opaque tokens listed in the request's `options[]` array
|
||||
* (e.g. `'denyOnce'`, `'allowOnce'`, `'allowForSession'`) — the daemon defines the vocabulary,
|
||||
* so we keep this as a string rather than a fixed enum. */
|
||||
const resourceDecisionConfirmSchema = z.object({
|
||||
kind: z.literal('resourceDecision'),
|
||||
resourceDecision: z.string(),
|
||||
});
|
||||
|
||||
/** Per-node credential map: `Record<nodeName, Record<credentialType, credentialId>>`. */
|
||||
const nodeCredentialsRecord = z.record(credentialIdByTypeSchema).optional();
|
||||
/** Per-node parameter map: `Record<nodeName, Record<paramName, value>>`. */
|
||||
const nodeParametersRecord = z.record(z.record(z.unknown())).optional();
|
||||
|
||||
/** Workflow-setup wizard: apply the chosen credentials/parameters. Approval is implied;
|
||||
* the service maps this to `action: 'apply'` for the underlying Mastra resume schema. */
|
||||
const setupWorkflowApplyConfirmSchema = z.object({
|
||||
kind: z.literal('setupWorkflowApply'),
|
||||
nodeCredentials: nodeCredentialsRecord,
|
||||
nodeParameters: nodeParametersRecord,
|
||||
});
|
||||
|
||||
/** Workflow-setup wizard: run a test-trigger against a specific node. Approval is implied;
|
||||
* the service maps this to `action: 'test-trigger'` for the underlying Mastra resume schema. */
|
||||
const setupWorkflowTestTriggerConfirmSchema = z.object({
|
||||
kind: z.literal('setupWorkflowTestTrigger'),
|
||||
testTriggerNode: z.string(),
|
||||
nodeCredentials: nodeCredentialsRecord,
|
||||
nodeParameters: nodeParametersRecord,
|
||||
});
|
||||
|
||||
export const InstanceAiConfirmRequestDto = z.discriminatedUnion('kind', [
|
||||
approvalConfirmSchema,
|
||||
questionsConfirmSchema,
|
||||
credentialSelectionConfirmSchema,
|
||||
domainAccessApproveSchema,
|
||||
domainAccessDenySchema,
|
||||
resourceDecisionConfirmSchema,
|
||||
setupWorkflowApplyConfirmSchema,
|
||||
setupWorkflowTestTriggerConfirmSchema,
|
||||
]);
|
||||
|
||||
export type InstanceAiConfirmRequest = z.infer<typeof InstanceAiConfirmRequestDto>;
|
||||
export type InstanceAiConfirmRequestKind = InstanceAiConfirmRequest['kind'];
|
||||
|
|
|
|||
|
|
@ -40,3 +40,12 @@ export class DeleteOAuthClientResponseDto extends Z.class({
|
|||
success: z.boolean(),
|
||||
message: z.string(),
|
||||
}) {}
|
||||
|
||||
/**
|
||||
* DTO for instance-wide MCP OAuth client capacity stats (admin-only)
|
||||
*/
|
||||
export class InstanceMcpClientStatsResponseDto extends Z.class({
|
||||
count: z.number(),
|
||||
limit: z.number(),
|
||||
atCapacity: z.boolean(),
|
||||
}) {}
|
||||
|
|
|
|||
|
|
@ -20,3 +20,14 @@ export class ProvisioningConfigPatchDto extends Z.class({
|
|||
scopesUseExpressionMapping: z.boolean().optional().nullable(),
|
||||
deleteProjectRules: z.boolean().optional(),
|
||||
}) {}
|
||||
|
||||
export type ProvisioningMode =
|
||||
| 'disabled'
|
||||
| 'instance_role'
|
||||
| 'instance_and_project_roles'
|
||||
| 'expression_based';
|
||||
|
||||
export type ProvisioningModeFlags = Pick<
|
||||
ProvisioningConfigDto,
|
||||
'scopesProvisionInstanceRole' | 'scopesProvisionProjectRoles' | 'scopesUseExpressionMapping'
|
||||
>;
|
||||
|
|
|
|||
|
|
@ -160,6 +160,7 @@ export interface FrontendSettings {
|
|||
};
|
||||
};
|
||||
workflowTagsDisabled: boolean;
|
||||
workflowsAutosaveDisabled: boolean;
|
||||
logLevel: LogLevel;
|
||||
hiringBannerEnabled: boolean;
|
||||
previewMode: boolean;
|
||||
|
|
|
|||
|
|
@ -276,6 +276,7 @@ export {
|
|||
toolResultPayloadSchema,
|
||||
toolErrorPayloadSchema,
|
||||
confirmationRequestPayloadSchema,
|
||||
confirmationInputTypeSchema,
|
||||
credentialRequestSchema,
|
||||
workflowSetupNodeSchema,
|
||||
errorPayloadSchema,
|
||||
|
|
@ -284,6 +285,7 @@ export {
|
|||
mcpToolCallRequestSchema,
|
||||
mcpToolCallResultSchema,
|
||||
getRenderHint,
|
||||
isDisplayableConfirmationRequest,
|
||||
isSafeObjectKey,
|
||||
DEFAULT_INSTANCE_AI_PERMISSIONS,
|
||||
UNLIMITED_CREDITS,
|
||||
|
|
@ -316,6 +318,8 @@ export type {
|
|||
InstanceAiEventType,
|
||||
InstanceAiRunStatus,
|
||||
InstanceAiConfirmation,
|
||||
InstanceAiConfirmationInputType,
|
||||
InstanceAiConfirmationRequestPayload,
|
||||
InstanceAiConfirmationSeverity,
|
||||
InstanceAiCredentialRequest,
|
||||
InstanceAiAgentStatus,
|
||||
|
|
@ -343,7 +347,6 @@ export type {
|
|||
InstanceAiEvent,
|
||||
InstanceAiAttachment,
|
||||
InstanceAiSendMessageResponse,
|
||||
InstanceAiConfirmResponse,
|
||||
InstanceAiToolCallState,
|
||||
InstanceAiAgentNode,
|
||||
InstanceAiTimelineEntry,
|
||||
|
|
@ -389,6 +392,13 @@ export {
|
|||
|
||||
export type { AgentRunState, AgentNode } from './schemas/agent-run-reducer';
|
||||
|
||||
export {
|
||||
EVAL_PARALLEL_EXECUTION_FLAG,
|
||||
startTestRunPayloadSchema,
|
||||
StartTestRunRequestDto,
|
||||
type StartTestRunPayload,
|
||||
} from './schemas/evaluations.schema';
|
||||
|
||||
export { ALLOWED_DOMAINS, isAllowedDomain } from './utils/allowed-domains';
|
||||
|
||||
export {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
import {
|
||||
applyBranchReadOnlyOverrides,
|
||||
DEFAULT_INSTANCE_AI_PERMISSIONS,
|
||||
isDisplayableConfirmationRequest,
|
||||
type InstanceAiConfirmationInputType,
|
||||
type InstanceAiConfirmationRequestPayload,
|
||||
type InstanceAiPermissions,
|
||||
} from '../instance-ai.schema';
|
||||
|
||||
|
|
@ -53,3 +56,178 @@ describe('applyBranchReadOnlyOverrides', () => {
|
|||
expect(original.createWorkflow).toBe('require_approval');
|
||||
});
|
||||
});
|
||||
|
||||
function makeConfirmation(
|
||||
overrides: Partial<InstanceAiConfirmationRequestPayload> = {},
|
||||
): InstanceAiConfirmationRequestPayload {
|
||||
return {
|
||||
requestId: 'req-1',
|
||||
toolCallId: 'tc-1',
|
||||
toolName: 'tool',
|
||||
args: {},
|
||||
severity: 'info',
|
||||
message: 'Please approve',
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
describe('isDisplayableConfirmationRequest', () => {
|
||||
it('treats approval and text messages as displayable', () => {
|
||||
expect(isDisplayableConfirmationRequest(makeConfirmation({ inputType: 'approval' }))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(isDisplayableConfirmationRequest(makeConfirmation({ inputType: 'text' }))).toBe(true);
|
||||
});
|
||||
|
||||
it('does not treat metadata-only approval prompts as displayable', () => {
|
||||
expect(isDisplayableConfirmationRequest(makeConfirmation({ message: ' ' }))).toBe(false);
|
||||
});
|
||||
|
||||
it('does not treat intro-only questions prompts as displayable', () => {
|
||||
expect(
|
||||
isDisplayableConfirmationRequest(
|
||||
makeConfirmation({
|
||||
inputType: 'questions',
|
||||
message: '',
|
||||
introMessage: 'A little context before the questions',
|
||||
}),
|
||||
),
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
it('recognizes typed display variants', () => {
|
||||
expect(
|
||||
isDisplayableConfirmationRequest(
|
||||
makeConfirmation({
|
||||
inputType: 'questions',
|
||||
message: '',
|
||||
questions: [{ id: 'q1', question: 'Pick one', type: 'single', options: ['A'] }],
|
||||
}),
|
||||
),
|
||||
).toBe(true);
|
||||
expect(
|
||||
isDisplayableConfirmationRequest(
|
||||
makeConfirmation({
|
||||
inputType: 'plan-review',
|
||||
message: 'Ignored for displayability',
|
||||
planItems: [{ id: 'task-1', title: 'Task', kind: 'delegate', spec: 'Do it', deps: [] }],
|
||||
}),
|
||||
),
|
||||
).toBe(true);
|
||||
expect(
|
||||
isDisplayableConfirmationRequest(
|
||||
makeConfirmation({
|
||||
inputType: 'resource-decision',
|
||||
message: '',
|
||||
resourceDecision: {
|
||||
toolGroup: 'filesystem',
|
||||
resource: '/tmp',
|
||||
description: 'Access /tmp',
|
||||
options: ['allowForSession'],
|
||||
},
|
||||
}),
|
||||
),
|
||||
).toBe(true);
|
||||
expect(
|
||||
isDisplayableConfirmationRequest(
|
||||
makeConfirmation({
|
||||
message: '',
|
||||
setupRequests: [
|
||||
{
|
||||
node: {
|
||||
name: 'Webhook',
|
||||
type: 'n8n-nodes-base.webhook',
|
||||
typeVersion: 1,
|
||||
parameters: {},
|
||||
position: [0, 0],
|
||||
id: 'node-1',
|
||||
},
|
||||
isTrigger: true,
|
||||
},
|
||||
],
|
||||
}),
|
||||
),
|
||||
).toBe(true);
|
||||
expect(
|
||||
isDisplayableConfirmationRequest(
|
||||
makeConfirmation({
|
||||
message: '',
|
||||
credentialRequests: [
|
||||
{ credentialType: 'httpBasicAuth', reason: 'Required', existingCredentials: [] },
|
||||
],
|
||||
}),
|
||||
),
|
||||
).toBe(true);
|
||||
expect(
|
||||
isDisplayableConfirmationRequest(
|
||||
makeConfirmation({
|
||||
message: '',
|
||||
domainAccess: { url: 'https://example.com', host: 'example.com' },
|
||||
}),
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it('does not treat credential flow metadata as displayable on its own', () => {
|
||||
expect(
|
||||
isDisplayableConfirmationRequest(
|
||||
makeConfirmation({
|
||||
message: '',
|
||||
credentialFlow: { stage: 'finalize' },
|
||||
}),
|
||||
),
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
it('does not treat lightweight task lists as displayable plan reviews', () => {
|
||||
expect(
|
||||
isDisplayableConfirmationRequest(
|
||||
makeConfirmation({
|
||||
inputType: 'plan-review',
|
||||
message: 'Ignored for displayability',
|
||||
tasks: {
|
||||
tasks: [{ id: 'task-1', description: 'Do it', status: 'todo' }],
|
||||
},
|
||||
}),
|
||||
),
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
it('recognizes only renderable task args for plan reviews', () => {
|
||||
expect(
|
||||
isDisplayableConfirmationRequest(
|
||||
makeConfirmation({
|
||||
inputType: 'plan-review',
|
||||
message: 'Ignored for displayability',
|
||||
args: {
|
||||
tasks: [{ id: 'task-1', title: 'Task', kind: 'delegate', spec: 'Do it', deps: [] }],
|
||||
},
|
||||
}),
|
||||
),
|
||||
).toBe(true);
|
||||
|
||||
expect(
|
||||
isDisplayableConfirmationRequest(
|
||||
makeConfirmation({
|
||||
inputType: 'plan-review',
|
||||
message: 'Ignored for displayability',
|
||||
args: {
|
||||
tasks: [{ id: 'task-1', description: 'Do it', status: 'todo' }],
|
||||
},
|
||||
}),
|
||||
),
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
it('keeps the input type switch exhaustive', () => {
|
||||
const handled = {
|
||||
approval: true,
|
||||
text: true,
|
||||
questions: true,
|
||||
'plan-review': true,
|
||||
'resource-decision': true,
|
||||
} satisfies Record<InstanceAiConfirmationInputType, true>;
|
||||
|
||||
expect(Object.keys(handled)).toHaveLength(5);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
26
packages/@n8n/api-types/src/schemas/evaluations.schema.ts
Normal file
26
packages/@n8n/api-types/src/schemas/evaluations.schema.ts
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
import { z } from 'zod';
|
||||
|
||||
import { Z } from '../zod-class';
|
||||
|
||||
// Single source of truth for the parallel-execution rollout flag id, shared
|
||||
// between the FE checkbox-rendering gate and the BE controller's safety net.
|
||||
// Strings can drift if duplicated; importing from a shared module cannot.
|
||||
export const EVAL_PARALLEL_EXECUTION_FLAG = '080_eval_parallel_execution';
|
||||
|
||||
// `concurrency` is the optional number of evaluation test cases to run in
|
||||
// parallel for a single test run. Clamped 1–10. When omitted, the runner
|
||||
// falls back to sequential execution (concurrency = 1). The PostHog
|
||||
// rollout flag `080_eval_parallel_execution` gates whether the controller
|
||||
// honours values > 1; flag-off requests are silently coerced to 1 so the
|
||||
// flag id never leaks into HTTP responses.
|
||||
const startTestRunPayloadShape = {
|
||||
concurrency: z.number().int().min(1).max(10).optional(),
|
||||
};
|
||||
|
||||
export const startTestRunPayloadSchema = z.object(startTestRunPayloadShape);
|
||||
export type StartTestRunPayload = z.infer<typeof startTestRunPayloadSchema>;
|
||||
|
||||
// Controller-side DTO used by the @Body decorator's reflection-based
|
||||
// validation. Shares the same shape as `startTestRunPayloadSchema` —
|
||||
// single source of truth so the two validators cannot silently diverge.
|
||||
export class StartTestRunRequestDto extends Z.class(startTestRunPayloadShape) {}
|
||||
|
|
@ -297,6 +297,15 @@ export type GatewayConfirmationRequiredPayload = z.infer<
|
|||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export const confirmationInputTypeSchema = z.enum([
|
||||
'approval',
|
||||
'text',
|
||||
'questions',
|
||||
'plan-review',
|
||||
'resource-decision',
|
||||
]);
|
||||
export type InstanceAiConfirmationInputType = z.infer<typeof confirmationInputTypeSchema>;
|
||||
|
||||
export const confirmationRequestPayloadSchema = z.object({
|
||||
requestId: z.string(),
|
||||
inputThreadId: z
|
||||
|
|
@ -315,8 +324,7 @@ export const confirmationRequestPayloadSchema = z.object({
|
|||
.describe(
|
||||
'Target project ID — used to scope actions (e.g. credential creation) to the correct project',
|
||||
),
|
||||
inputType: z
|
||||
.enum(['approval', 'text', 'questions', 'plan-review', 'resource-decision'])
|
||||
inputType: confirmationInputTypeSchema
|
||||
.optional()
|
||||
.describe(
|
||||
'UI mode: approval (default) shows approve/deny, text shows a text input, ' +
|
||||
|
|
@ -359,6 +367,53 @@ export const confirmationRequestPayloadSchema = z.object({
|
|||
.optional()
|
||||
.describe('Gateway resource-access decision data (inputType=resource-decision)'),
|
||||
});
|
||||
export type InstanceAiConfirmationRequestPayload = z.infer<typeof confirmationRequestPayloadSchema>;
|
||||
|
||||
function isNonEmptyString(value: unknown): value is string {
|
||||
return typeof value === 'string' && value.trim().length > 0;
|
||||
}
|
||||
|
||||
function hasItems<T>(items: T[] | undefined): items is [T, ...T[]] {
|
||||
return Array.isArray(items) && items.length > 0;
|
||||
}
|
||||
|
||||
function argsContainPlannedTasks(args: Record<string, unknown>): boolean {
|
||||
const tasks = args.tasks;
|
||||
if (!Array.isArray(tasks)) return false;
|
||||
|
||||
return tasks.some((task) => plannedTaskArgSchema.safeParse(task).success);
|
||||
}
|
||||
|
||||
function assertNever(value: never): never {
|
||||
throw new Error(`Unhandled confirmation input type: ${String(value)}`);
|
||||
}
|
||||
|
||||
/**
|
||||
* True when the current frontend has enough typed confirmation payload to show
|
||||
* a meaningful waiting-for-user UI. Correlation metadata alone must not count.
|
||||
*/
|
||||
export function isDisplayableConfirmationRequest(
|
||||
payload: InstanceAiConfirmationRequestPayload,
|
||||
): boolean {
|
||||
if (hasItems(payload.setupRequests)) return true;
|
||||
if (hasItems(payload.credentialRequests)) return true;
|
||||
if (payload.domainAccess) return true;
|
||||
|
||||
const inputType = payload.inputType ?? 'approval';
|
||||
switch (inputType) {
|
||||
case 'approval':
|
||||
case 'text':
|
||||
return isNonEmptyString(payload.message);
|
||||
case 'questions':
|
||||
return hasItems(payload.questions);
|
||||
case 'plan-review':
|
||||
return hasItems(payload.planItems) || argsContainPlannedTasks(payload.args);
|
||||
case 'resource-decision':
|
||||
return payload.resourceDecision !== undefined;
|
||||
default:
|
||||
return assertNever(inputType);
|
||||
}
|
||||
}
|
||||
|
||||
export const statusPayloadSchema = z.object({
|
||||
message: z.string().describe('Transient status message. Empty string clears the indicator.'),
|
||||
|
|
@ -601,28 +656,6 @@ export interface InstanceAiSendMessageResponse {
|
|||
runId: string;
|
||||
}
|
||||
|
||||
export interface InstanceAiConfirmResponse {
|
||||
approved: boolean;
|
||||
credentialId?: string;
|
||||
credentials?: Record<string, string>;
|
||||
/** Per-node credential assignments: `{ nodeName: { credType: credId } }`.
|
||||
* Preferred over `credentials` when present — enables card-scoped selection. */
|
||||
nodeCredentials?: Record<string, Record<string, string>>;
|
||||
autoSetup?: { credentialType: string };
|
||||
userInput?: string;
|
||||
domainAccessAction?: DomainAccessAction;
|
||||
resourceDecision?: string;
|
||||
action?: 'apply' | 'test-trigger';
|
||||
nodeParameters?: Record<string, Record<string, unknown>>;
|
||||
testTriggerNode?: string;
|
||||
answers?: Array<{
|
||||
questionId: string;
|
||||
selectedOptions: string[];
|
||||
customText?: string;
|
||||
skipped?: boolean;
|
||||
}>;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Frontend store types (shared so both sides agree on structure)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
@ -735,6 +768,7 @@ export interface InstanceAiThreadSummary {
|
|||
id: string;
|
||||
title: string;
|
||||
createdAt: string;
|
||||
updatedAt: string;
|
||||
metadata?: Record<string, unknown>;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@n8n/backend-common",
|
||||
"version": "1.19.0",
|
||||
"version": "1.20.0",
|
||||
"scripts": {
|
||||
"clean": "rimraf dist .turbo",
|
||||
"dev": "pnpm watch",
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@n8n/backend-test-utils",
|
||||
"version": "1.19.0",
|
||||
"version": "1.20.0",
|
||||
"scripts": {
|
||||
"clean": "rimraf dist .turbo",
|
||||
"dev": "pnpm watch",
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@n8n/chat-hub",
|
||||
"version": "1.12.0",
|
||||
"version": "1.13.0",
|
||||
"scripts": {
|
||||
"clean": "rimraf dist .turbo",
|
||||
"dev": "pnpm watch",
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@n8n/client-oauth2",
|
||||
"version": "1.3.0",
|
||||
"version": "1.4.0",
|
||||
"scripts": {
|
||||
"clean": "rimraf dist .turbo",
|
||||
"dev": "pnpm watch",
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ export interface OAuth2CredentialData {
|
|||
};
|
||||
useDynamicClientRegistration?: boolean;
|
||||
serverUrl?: string;
|
||||
jweEnabled?: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@n8n/config",
|
||||
"version": "2.18.0",
|
||||
"version": "2.19.0",
|
||||
"scripts": {
|
||||
"clean": "rimraf dist .turbo",
|
||||
"dev": "pnpm watch",
|
||||
|
|
|
|||
|
|
@ -136,7 +136,7 @@ export class EndpointsConfig {
|
|||
|
||||
/** Maximum number of OAuth clients that can be registered for MCP. */
|
||||
@Env('N8N_MCP_MAX_REGISTERED_CLIENTS')
|
||||
mcpMaxRegisteredClients: number = 200;
|
||||
mcpMaxRegisteredClients: number = 5000;
|
||||
|
||||
/** Whether to disable n8n's UI (frontend). */
|
||||
@Env('N8N_DISABLE_UI')
|
||||
|
|
|
|||
25
packages/@n8n/config/src/configs/evaluation.config.ts
Normal file
25
packages/@n8n/config/src/configs/evaluation.config.ts
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
import { Config, Env } from '../decorators';
|
||||
|
||||
@Config
|
||||
export class EvaluationConfig {
|
||||
/**
|
||||
* Force-enable the parallel-execution feature for evaluation test runs.
|
||||
*
|
||||
* Acts as an operator-level override of the `080_eval_parallel_execution`
|
||||
* PostHog rollout flag. When set to `true`, the FE renders the
|
||||
* concurrency UI for every user and the BE honours `concurrency` payloads
|
||||
* regardless of PostHog cohort. When `false` (default), PostHog remains
|
||||
* the source of truth — the rollout flag controls visibility per-user.
|
||||
*
|
||||
* Useful for:
|
||||
* - Local development without PostHog wiring.
|
||||
* - Operator escape hatch if PostHog is unreachable.
|
||||
* - Self-hosted deployments that want the feature without PostHog
|
||||
* dependency.
|
||||
*
|
||||
* Cannot force-disable: setting this to `false` falls back to PostHog,
|
||||
* not a kill-switch. Use PostHog itself to disable a rolled-out flag.
|
||||
*/
|
||||
@Env('N8N_EVAL_PARALLEL_EXECUTION_ENABLED')
|
||||
parallelExecutionEnabled: boolean = false;
|
||||
}
|
||||
|
|
@ -16,6 +16,19 @@ class LogWriterConfig {
|
|||
@Env('N8N_EVENTBUS_LOGWRITER_LOGBASENAME')
|
||||
logBaseName: string = 'n8nEventLog';
|
||||
|
||||
/**
|
||||
* Absolute path to the primary event log file (must end in `.log`). When set,
|
||||
* used verbatim with no process-type suffix; the operator owns per-pod
|
||||
* uniqueness (e.g. via Kubernetes Downward API or per-pod PVC). Parent
|
||||
* directory is auto-created. Rotation siblings (e.g. `myEventLog-1.log`,
|
||||
* `myEventLog-2.log`, …, derived from the configured `logFullPath`) and the
|
||||
* `.recoveryInProgress` marker colocate with this path. Empty (default)
|
||||
* preserves the legacy `${N8N_USER_FOLDER}/n8nEventLog[-worker|-webhook-processor]`
|
||||
* behavior.
|
||||
*/
|
||||
@Env('N8N_EVENTBUS_LOGWRITER_LOGFULLPATH')
|
||||
logFullPath: string = '';
|
||||
|
||||
/**
|
||||
* Safety tripwire: per-file cap on concurrently unconfirmed messages held in memory
|
||||
* during startup log parsing. Aborts the file if exceeded, to prevent OOM on legacy
|
||||
|
|
|
|||
|
|
@ -82,6 +82,10 @@ export class InstanceAiConfig {
|
|||
@Env('N8N_INSTANCE_AI_SANDBOX_TIMEOUT')
|
||||
sandboxTimeout: number = 300_000;
|
||||
|
||||
/** How long to keep completed workflow-builder sandboxes warm for follow-up fixes. 0 = disabled. */
|
||||
@Env('N8N_INSTANCE_AI_BUILDER_SANDBOX_TTL_MS')
|
||||
builderSandboxTtlMs: number = 10 * 60 * 1000;
|
||||
|
||||
/** Brave Search API key for web search. No key = search + research agent disabled. */
|
||||
@Env('INSTANCE_AI_BRAVE_SEARCH_API_KEY')
|
||||
braveSearchApiKey: string = '';
|
||||
|
|
|
|||
|
|
@ -13,4 +13,8 @@ export class MultiMainSetupConfig {
|
|||
/** Interval in seconds between leader eligibility checks in multi-main setup. */
|
||||
@Env('N8N_MULTI_MAIN_SETUP_CHECK_INTERVAL')
|
||||
interval: number = 3;
|
||||
|
||||
/** Whether to use the new leader election implementation (Lua-script based). */
|
||||
@Env('N8N_NEW_LEADER_ELECTION_IMPLEMENTATION')
|
||||
newLeaderElection: boolean = false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,4 +30,8 @@ export class WorkflowsConfig {
|
|||
/** Whether to use the workflow publication service. Still under development. */
|
||||
@Env('N8N_USE_WORKFLOW_PUBLICATION_SERVICE')
|
||||
useWorkflowPublicationService: boolean = false;
|
||||
|
||||
/** Whether to disable automatic workflow saving in the editor */
|
||||
@Env('N8N_WORKFLOWS_AUTOSAVE_DISABLED')
|
||||
autosaveDisabled: boolean = false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ import { DeploymentConfig } from './configs/deployment.config';
|
|||
import { DiagnosticsConfig } from './configs/diagnostics.config';
|
||||
import { DynamicBannersConfig } from './configs/dynamic-banners.config';
|
||||
import { EndpointsConfig } from './configs/endpoints.config';
|
||||
import { EvaluationConfig } from './configs/evaluation.config';
|
||||
import { EventBusConfig } from './configs/event-bus.config';
|
||||
import { ExecutionsConfig } from './configs/executions.config';
|
||||
import { ExpressionEngineConfig } from './configs/expression-engine.config';
|
||||
|
|
@ -77,6 +78,7 @@ export { ChatTriggerConfig } from './configs/chat-trigger.config';
|
|||
export { InstanceAiConfig } from './configs/instance-ai.config';
|
||||
export { ExpressionEngineConfig } from './configs/expression-engine.config';
|
||||
export { PasswordConfig } from './configs/password.config';
|
||||
export { RedisConfig } from './configs/redis.config';
|
||||
|
||||
const protocolSchema = z.enum(['http', 'https']);
|
||||
|
||||
|
|
@ -161,6 +163,9 @@ export class GlobalConfig {
|
|||
@Nested
|
||||
multiMainSetup: MultiMainSetupConfig;
|
||||
|
||||
@Nested
|
||||
evaluation: EvaluationConfig;
|
||||
|
||||
@Nested
|
||||
generic: GenericConfig;
|
||||
|
||||
|
|
|
|||
|
|
@ -161,6 +161,7 @@ describe('GlobalConfig', () => {
|
|||
logWriter: {
|
||||
keepLogCount: 3,
|
||||
logBaseName: 'n8nEventLog',
|
||||
logFullPath: '',
|
||||
maxFileSizeInKB: 10240,
|
||||
maxMessagesPerParse: 10_000,
|
||||
maxTotalMessagesPerFile: 500_000,
|
||||
|
|
@ -204,6 +205,7 @@ describe('GlobalConfig', () => {
|
|||
indexingEnabled: true,
|
||||
indexingBatchSize: 10,
|
||||
useWorkflowPublicationService: false,
|
||||
autosaveDisabled: false,
|
||||
},
|
||||
endpoints: {
|
||||
metrics: {
|
||||
|
|
@ -235,7 +237,7 @@ describe('GlobalConfig', () => {
|
|||
formWaiting: 'form-waiting',
|
||||
mcp: 'mcp',
|
||||
mcpBuilderEnabled: true,
|
||||
mcpMaxRegisteredClients: 200,
|
||||
mcpMaxRegisteredClients: 5000,
|
||||
mcpTest: 'mcp-test',
|
||||
payloadSizeMax: 16,
|
||||
formDataFileSizeMax: 200,
|
||||
|
|
@ -284,6 +286,7 @@ describe('GlobalConfig', () => {
|
|||
n8nSandboxServiceUrl: '',
|
||||
n8nSandboxServiceApiKey: '',
|
||||
sandboxTimeout: 300000,
|
||||
builderSandboxTtlMs: 600_000,
|
||||
braveSearchApiKey: '',
|
||||
searxngUrl: '',
|
||||
gatewayApiKey: '',
|
||||
|
|
@ -368,6 +371,10 @@ describe('GlobalConfig', () => {
|
|||
enabled: false,
|
||||
ttl: 10,
|
||||
interval: 3,
|
||||
newLeaderElection: false,
|
||||
},
|
||||
evaluation: {
|
||||
parallelExecutionEnabled: false,
|
||||
},
|
||||
generic: {
|
||||
timezone: 'America/New_York',
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@n8n/create-node",
|
||||
"version": "0.28.0",
|
||||
"version": "0.29.0",
|
||||
"description": "Official CLI to create new community nodes for n8n",
|
||||
"bin": {
|
||||
"create-node": "bin/create-node.cjs"
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@n8n/db",
|
||||
"version": "1.19.0",
|
||||
"version": "1.20.0",
|
||||
"scripts": {
|
||||
"clean": "rimraf dist .turbo",
|
||||
"dev": "pnpm watch",
|
||||
|
|
|
|||
|
|
@ -208,6 +208,8 @@ export namespace ExecutionSummaries {
|
|||
vote: AnnotationVote;
|
||||
projectId: string;
|
||||
workflowVersionId: string;
|
||||
isArchived: boolean;
|
||||
workflowBooleanSettings: Array<{ key: string; value: boolean }>;
|
||||
}>;
|
||||
|
||||
export type StopExecutionFilterQuery = { workflowId: string } & Pick<
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import { GlobalConfig } from '@n8n/config';
|
|||
import { In, type SelectQueryBuilder } from '@n8n/typeorm';
|
||||
import { mock } from 'jest-mock-extended';
|
||||
|
||||
import { AiBuilderTemporaryWorkflow, WorkflowEntity } from '../../entities';
|
||||
import { WorkflowEntity } from '../../entities';
|
||||
import { mockEntityManager } from '../../utils/test-utils/mock-entity-manager';
|
||||
import { mockInstance } from '../../utils/test-utils/mock-instance';
|
||||
import { FolderRepository } from '../folder.repository';
|
||||
|
|
@ -32,17 +32,9 @@ describe('WorkflowRepository', () => {
|
|||
jest.resetAllMocks();
|
||||
|
||||
queryBuilder = mock<SelectQueryBuilder<WorkflowEntity>>();
|
||||
const subQueryBuilder = mock<SelectQueryBuilder<AiBuilderTemporaryWorkflow>>();
|
||||
subQueryBuilder.select.mockReturnThis();
|
||||
subQueryBuilder.from.mockReturnThis();
|
||||
subQueryBuilder.where.mockReturnThis();
|
||||
subQueryBuilder.getQuery.mockReturnValue(
|
||||
'(SELECT 1 FROM "ai_builder_temporary_workflow" "aitw" WHERE aitw."workflowId" = workflow.id)',
|
||||
);
|
||||
|
||||
queryBuilder.where.mockReturnThis();
|
||||
queryBuilder.andWhere.mockReturnThis();
|
||||
queryBuilder.subQuery.mockReturnValue(subQueryBuilder);
|
||||
queryBuilder.orWhere.mockReturnThis();
|
||||
queryBuilder.select.mockReturnThis();
|
||||
queryBuilder.addSelect.mockReturnThis();
|
||||
|
|
@ -65,18 +57,6 @@ describe('WorkflowRepository', () => {
|
|||
jest.spyOn(workflowRepository, 'createQueryBuilder').mockReturnValue(queryBuilder);
|
||||
});
|
||||
|
||||
describe('applyAiBuilderTemporaryFilter', () => {
|
||||
it('hides marker-table rows through a prefix-safe entity subquery', async () => {
|
||||
await workflowRepository.getMany(['workflow1']);
|
||||
|
||||
expect(queryBuilder.subQuery).toHaveBeenCalled();
|
||||
expect(queryBuilder.subQuery().from).toHaveBeenCalledWith(AiBuilderTemporaryWorkflow, 'aitw');
|
||||
expect(queryBuilder.andWhere).toHaveBeenCalledWith(
|
||||
expect.stringContaining('NOT EXISTS (SELECT 1 FROM "ai_builder_temporary_workflow"'),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('applyNameFilter', () => {
|
||||
it('should search for workflows containing all words from the query', async () => {
|
||||
const workflowIds = ['workflow1'];
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ import type {
|
|||
IExecutionFlattedDb,
|
||||
IExecutionResponse,
|
||||
} from '../entities/types-db';
|
||||
import { applyWorkflowBooleanSettingFilter } from '../utils/apply-workflow-boolean-setting-filter';
|
||||
import { separate } from '../utils/separate';
|
||||
|
||||
class PostgresLiveRowsRetrievalError extends UnexpectedError {
|
||||
|
|
@ -986,6 +987,8 @@ export class ExecutionRepository extends Repository<ExecutionEntity> {
|
|||
vote,
|
||||
projectId,
|
||||
workflowVersionId,
|
||||
isArchived,
|
||||
workflowBooleanSettings,
|
||||
} = query;
|
||||
|
||||
const fields = Object.keys(this.summaryFields)
|
||||
|
|
@ -1089,6 +1092,16 @@ export class ExecutionRepository extends Repository<ExecutionEntity> {
|
|||
.andWhere('sw.projectId = :projectId', { projectId });
|
||||
}
|
||||
|
||||
if (isArchived !== undefined) {
|
||||
qb.andWhere('workflow.isArchived = :isArchived', { isArchived });
|
||||
}
|
||||
|
||||
if (workflowBooleanSettings?.length) {
|
||||
for (const { key, value } of workflowBooleanSettings) {
|
||||
applyWorkflowBooleanSettingFilter(qb, this.globalConfig, key, value);
|
||||
}
|
||||
}
|
||||
|
||||
return qb;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -18,7 +18,6 @@ import { SharedWorkflowRepository } from './shared-workflow.repository';
|
|||
import { WorkflowHistoryRepository } from './workflow-history.repository';
|
||||
import {
|
||||
WebhookEntity,
|
||||
AiBuilderTemporaryWorkflow,
|
||||
TagEntity,
|
||||
WorkflowEntity,
|
||||
WorkflowTagMapping,
|
||||
|
|
@ -30,6 +29,7 @@ import type {
|
|||
FolderWithWorkflowAndSubFolderCount,
|
||||
ListQuery,
|
||||
} from '../entities/types-db';
|
||||
import { applyWorkflowBooleanSettingFilter } from '../utils/apply-workflow-boolean-setting-filter';
|
||||
import { buildWorkflowsByNodesQuery } from '../utils/build-workflows-by-nodes-query';
|
||||
import { isStringArray } from '../utils/is-string-array';
|
||||
import { TimedQuery } from '../utils/timed-query';
|
||||
|
|
@ -883,23 +883,6 @@ export class WorkflowRepository extends Repository<WorkflowEntity> {
|
|||
this.applyParentFolderFilter(qb, filter);
|
||||
this.applyNodeTypesFilter(qb, filter);
|
||||
this.applyAvailableInMCPFilter(qb, filter);
|
||||
this.applyAiBuilderTemporaryFilter(qb);
|
||||
}
|
||||
|
||||
/**
|
||||
* Hide workflows the AI builder created and has not yet promoted to the
|
||||
* main deliverable. The orchestrator clears the marker on the main at
|
||||
* build-time and reaps the rest at run-finish, but in the window between
|
||||
* create and reap, marked rows must not surface in the workflows list.
|
||||
*/
|
||||
private applyAiBuilderTemporaryFilter(qb: SelectQueryBuilder<WorkflowEntity>): void {
|
||||
const markerSubquery = qb
|
||||
.subQuery()
|
||||
.select('1')
|
||||
.from(AiBuilderTemporaryWorkflow, 'aitw')
|
||||
.where('aitw."workflowId" = workflow.id')
|
||||
.getQuery();
|
||||
qb.andWhere(`NOT EXISTS ${markerSubquery}`);
|
||||
}
|
||||
|
||||
private applyAvailableInMCPFilter(
|
||||
|
|
@ -907,33 +890,15 @@ export class WorkflowRepository extends Repository<WorkflowEntity> {
|
|||
filter: ListQuery.Options['filter'],
|
||||
): void {
|
||||
if (typeof filter?.availableInMCP === 'boolean') {
|
||||
const dbType = this.globalConfig.database.type;
|
||||
|
||||
if (filter.availableInMCP) {
|
||||
// When filtering for true, only match explicit true values
|
||||
if (dbType === 'postgresdb') {
|
||||
qb.andWhere("workflow.settings ->> 'availableInMCP' = :availableInMCP", {
|
||||
availableInMCP: 'true',
|
||||
});
|
||||
} else if (dbType === 'sqlite') {
|
||||
qb.andWhere("JSON_EXTRACT(workflow.settings, '$.availableInMCP') = :availableInMCP", {
|
||||
availableInMCP: 1, // SQLite stores booleans as 0/1
|
||||
});
|
||||
}
|
||||
} else {
|
||||
// When filtering for false, match explicit false OR null/undefined (field not set)
|
||||
if (dbType === 'postgresdb') {
|
||||
qb.andWhere(
|
||||
"(workflow.settings ->> 'availableInMCP' = :availableInMCP OR workflow.settings ->> 'availableInMCP' IS NULL)",
|
||||
{ availableInMCP: 'false' },
|
||||
);
|
||||
} else if (dbType === 'sqlite') {
|
||||
qb.andWhere(
|
||||
"(JSON_EXTRACT(workflow.settings, '$.availableInMCP') = :availableInMCP OR JSON_EXTRACT(workflow.settings, '$.availableInMCP') IS NULL)",
|
||||
{ availableInMCP: 0 }, // SQLite stores booleans as 0/1
|
||||
);
|
||||
}
|
||||
}
|
||||
applyWorkflowBooleanSettingFilter(
|
||||
qb,
|
||||
this.globalConfig,
|
||||
'availableInMCP',
|
||||
filter.availableInMCP,
|
||||
{
|
||||
includeNullOnFalse: true,
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,136 @@
|
|||
import type { GlobalConfig } from '@n8n/config';
|
||||
import type { SelectQueryBuilder } from '@n8n/typeorm';
|
||||
|
||||
import { applyWorkflowBooleanSettingFilter } from '../apply-workflow-boolean-setting-filter';
|
||||
|
||||
function createMockQb() {
|
||||
const qb = {
|
||||
andWhere: jest.fn(),
|
||||
where: jest.fn(),
|
||||
orWhere: jest.fn(),
|
||||
} as unknown as SelectQueryBuilder<object>;
|
||||
return qb;
|
||||
}
|
||||
|
||||
function createGlobalConfig(dbType: 'postgresdb' | 'sqlite') {
|
||||
return { database: { type: dbType } } as GlobalConfig;
|
||||
}
|
||||
|
||||
describe('applyWorkflowBooleanSettingFilter', () => {
|
||||
describe('key validation', () => {
|
||||
it('should reject keys with special characters', () => {
|
||||
const qb = createMockQb();
|
||||
expect(() =>
|
||||
applyWorkflowBooleanSettingFilter(qb, createGlobalConfig('sqlite'), "'; DROP TABLE", true),
|
||||
).toThrow('Invalid settings key');
|
||||
});
|
||||
|
||||
it('should reject keys starting with a number', () => {
|
||||
const qb = createMockQb();
|
||||
expect(() =>
|
||||
applyWorkflowBooleanSettingFilter(qb, createGlobalConfig('sqlite'), '1abc', true),
|
||||
).toThrow('Invalid settings key');
|
||||
});
|
||||
|
||||
it('should accept valid alphanumeric keys', () => {
|
||||
const qb = createMockQb();
|
||||
expect(() =>
|
||||
applyWorkflowBooleanSettingFilter(qb, createGlobalConfig('sqlite'), 'availableInMCP', true),
|
||||
).not.toThrow();
|
||||
});
|
||||
});
|
||||
|
||||
describe('postgres', () => {
|
||||
const config = createGlobalConfig('postgresdb');
|
||||
|
||||
it('should filter for true values', () => {
|
||||
const qb = createMockQb();
|
||||
applyWorkflowBooleanSettingFilter(qb, config, 'availableInMCP', true);
|
||||
|
||||
expect(qb.andWhere).toHaveBeenCalledWith(
|
||||
"workflow.settings ->> 'availableInMCP' = :availableInMCP",
|
||||
{ availableInMCP: 'true' },
|
||||
);
|
||||
});
|
||||
|
||||
it('should filter for false values', () => {
|
||||
const qb = createMockQb();
|
||||
applyWorkflowBooleanSettingFilter(qb, config, 'availableInMCP', false);
|
||||
|
||||
expect(qb.andWhere).toHaveBeenCalledWith(
|
||||
"(workflow.settings ->> 'availableInMCP' = :availableInMCP)",
|
||||
{ availableInMCP: 'false' },
|
||||
);
|
||||
});
|
||||
|
||||
it('should include null clause when includeNullOnFalse is true', () => {
|
||||
const qb = createMockQb();
|
||||
applyWorkflowBooleanSettingFilter(qb, config, 'availableInMCP', false, {
|
||||
includeNullOnFalse: true,
|
||||
});
|
||||
|
||||
expect(qb.andWhere).toHaveBeenCalledWith(
|
||||
"(workflow.settings ->> 'availableInMCP' = :availableInMCP OR workflow.settings ->> 'availableInMCP' IS NULL)",
|
||||
{ availableInMCP: 'false' },
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('sqlite', () => {
|
||||
const config = createGlobalConfig('sqlite');
|
||||
|
||||
it('should filter for true values', () => {
|
||||
const qb = createMockQb();
|
||||
applyWorkflowBooleanSettingFilter(qb, config, 'availableInMCP', true);
|
||||
|
||||
expect(qb.andWhere).toHaveBeenCalledWith(
|
||||
"JSON_EXTRACT(workflow.settings, '$.availableInMCP') = :availableInMCP",
|
||||
{ availableInMCP: 1 },
|
||||
);
|
||||
});
|
||||
|
||||
it('should filter for false values', () => {
|
||||
const qb = createMockQb();
|
||||
applyWorkflowBooleanSettingFilter(qb, config, 'availableInMCP', false);
|
||||
|
||||
expect(qb.andWhere).toHaveBeenCalledWith(
|
||||
"(JSON_EXTRACT(workflow.settings, '$.availableInMCP') = :availableInMCP)",
|
||||
{ availableInMCP: 0 },
|
||||
);
|
||||
});
|
||||
|
||||
it('should include null clause when includeNullOnFalse is true', () => {
|
||||
const qb = createMockQb();
|
||||
applyWorkflowBooleanSettingFilter(qb, config, 'availableInMCP', false, {
|
||||
includeNullOnFalse: true,
|
||||
});
|
||||
|
||||
expect(qb.andWhere).toHaveBeenCalledWith(
|
||||
"(JSON_EXTRACT(workflow.settings, '$.availableInMCP') = :availableInMCP OR JSON_EXTRACT(workflow.settings, '$.availableInMCP') IS NULL)",
|
||||
{ availableInMCP: 0 },
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('options', () => {
|
||||
const config = createGlobalConfig('sqlite');
|
||||
|
||||
it('should use custom alias', () => {
|
||||
const qb = createMockQb();
|
||||
applyWorkflowBooleanSettingFilter(qb, config, 'availableInMCP', true, { alias: 'wf' });
|
||||
|
||||
expect(qb.andWhere).toHaveBeenCalledWith(
|
||||
"JSON_EXTRACT(wf.settings, '$.availableInMCP') = :availableInMCP",
|
||||
{ availableInMCP: 1 },
|
||||
);
|
||||
});
|
||||
|
||||
it('should use custom method', () => {
|
||||
const qb = createMockQb();
|
||||
applyWorkflowBooleanSettingFilter(qb, config, 'availableInMCP', true, { method: 'where' });
|
||||
|
||||
expect(qb.where).toHaveBeenCalled();
|
||||
expect(qb.andWhere).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,53 @@
|
|||
import type { GlobalConfig } from '@n8n/config';
|
||||
import type { SelectQueryBuilder } from '@n8n/typeorm';
|
||||
|
||||
type BooleanSettingFilterOptions = {
|
||||
alias?: string;
|
||||
method?: 'where' | 'andWhere' | 'orWhere';
|
||||
includeNullOnFalse?: boolean;
|
||||
};
|
||||
|
||||
const VALID_KEY_PATTERN = /^[a-zA-Z][a-zA-Z0-9_]*$/;
|
||||
|
||||
export function applyWorkflowBooleanSettingFilter<Entity extends object>(
|
||||
qb: SelectQueryBuilder<Entity>,
|
||||
globalConfig: GlobalConfig,
|
||||
key: string,
|
||||
value: boolean,
|
||||
options: BooleanSettingFilterOptions = {},
|
||||
): void {
|
||||
if (!VALID_KEY_PATTERN.test(key)) {
|
||||
throw new Error(`Invalid settings key: ${key}`);
|
||||
}
|
||||
|
||||
const { alias = 'workflow', method = 'andWhere', includeNullOnFalse = false } = options;
|
||||
const dbType = globalConfig.database.type;
|
||||
const settingsColumn = `${alias}.settings`;
|
||||
const parameterName = key;
|
||||
|
||||
if (value) {
|
||||
// When filtering for true, only match explicit true values.
|
||||
if (dbType === 'postgresdb') {
|
||||
qb[method](`${settingsColumn} ->> '${key}' = :${parameterName}`, {
|
||||
[parameterName]: 'true',
|
||||
});
|
||||
} else if (dbType === 'sqlite') {
|
||||
qb[method](`JSON_EXTRACT(${settingsColumn}, '$.${key}') = :${parameterName}`, {
|
||||
[parameterName]: 1,
|
||||
});
|
||||
}
|
||||
} else if (dbType === 'postgresdb') {
|
||||
// Optionally treat null/undefined the same as false for settings that default to off.
|
||||
const nullClause = includeNullOnFalse ? ` OR ${settingsColumn} ->> '${key}' IS NULL` : '';
|
||||
qb[method](`(${settingsColumn} ->> '${key}' = :${parameterName}${nullClause})`, {
|
||||
[parameterName]: 'false',
|
||||
});
|
||||
} else if (dbType === 'sqlite') {
|
||||
// SQLite stores booleans as 0/1 inside JSON_EXTRACT results.
|
||||
const extracted = `JSON_EXTRACT(${settingsColumn}, '$.${key}')`;
|
||||
const nullClause = includeNullOnFalse ? ` OR ${extracted} IS NULL` : '';
|
||||
qb[method](`(${extracted} = :${parameterName}${nullClause})`, {
|
||||
[parameterName]: 0,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@n8n/decorators",
|
||||
"version": "1.19.0",
|
||||
"version": "1.20.0",
|
||||
"scripts": {
|
||||
"clean": "rimraf dist .turbo",
|
||||
"dev": "pnpm watch",
|
||||
|
|
|
|||
|
|
@ -60,8 +60,10 @@ export default [
|
|||
| [no-overrides-field](docs/rules/no-overrides-field.md) | Ban the "overrides" field in community node package.json | ✅ ☑️ | | | | |
|
||||
| [no-restricted-globals](docs/rules/no-restricted-globals.md) | Disallow usage of restricted global variables in community nodes. | ✅ | | | | |
|
||||
| [no-restricted-imports](docs/rules/no-restricted-imports.md) | Disallow usage of restricted imports in community nodes. | ✅ | | | | |
|
||||
| [no-runtime-dependencies](docs/rules/no-runtime-dependencies.md) | Disallow non-empty "dependencies" in community node package.json | ✅ ☑️ | | | | |
|
||||
| [node-class-description-icon-missing](docs/rules/node-class-description-icon-missing.md) | Node class description must have an `icon` property defined. Deprecated: use `require-node-description-fields` instead. | | | | 💡 | ❌ |
|
||||
| [node-connection-type-literal](docs/rules/node-connection-type-literal.md) | Disallow string literals in node description `inputs`/`outputs` — use `NodeConnectionTypes` enum instead | ✅ ☑️ | | 🔧 | | |
|
||||
| [node-operation-error-itemindex](docs/rules/node-operation-error-itemindex.md) | Require { itemIndex } in NodeOperationError / NodeApiError options inside item loops | ✅ ☑️ | | | | |
|
||||
| [node-usable-as-tool](docs/rules/node-usable-as-tool.md) | Ensure node classes have usableAsTool property | ✅ ☑️ | | 🔧 | | |
|
||||
| [options-sorted-alphabetically](docs/rules/options-sorted-alphabetically.md) | Enforce alphabetical ordering of options arrays in n8n node properties | | ✅ ☑️ | | | |
|
||||
| [package-name-convention](docs/rules/package-name-convention.md) | Enforce correct package naming convention for n8n community nodes | ✅ ☑️ | | | 💡 | |
|
||||
|
|
@ -70,6 +72,7 @@ export default [
|
|||
| [require-node-api-error](docs/rules/require-node-api-error.md) | Require NodeApiError or NodeOperationError for error wrapping in catch blocks. Raw errors lose HTTP context in the n8n UI. | ✅ ☑️ | | | | |
|
||||
| [require-node-description-fields](docs/rules/require-node-description-fields.md) | Node class description must define all required fields: icon, subtitle | ✅ ☑️ | | | | |
|
||||
| [resource-operation-pattern](docs/rules/resource-operation-pattern.md) | Enforce proper resource/operation pattern for better UX in n8n nodes | | ✅ ☑️ | | | |
|
||||
| [valid-credential-references](docs/rules/valid-credential-references.md) | Ensure credentials referenced in node descriptions exist as credential classes in the package | ✅ ☑️ | | | 💡 | |
|
||||
| [valid-peer-dependencies](docs/rules/valid-peer-dependencies.md) | Require community node package.json peerDependencies to contain only "n8n-workflow": "*" (and optionally "ai-node-sdk") | ✅ ☑️ | | 🔧 | | |
|
||||
| [webhook-lifecycle-complete](docs/rules/webhook-lifecycle-complete.md) | Require webhook trigger nodes to implement the complete webhookMethods lifecycle (checkExists, create, delete) | ✅ ☑️ | | | | |
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,58 @@
|
|||
# Disallow non-empty "dependencies" in community node package.json (`@n8n/community-nodes/no-runtime-dependencies`)
|
||||
|
||||
💼 This rule is enabled in the following configs: ✅ `recommended`, ☑️ `recommendedWithoutN8nCloudSupport`.
|
||||
|
||||
<!-- end auto-generated rule header -->
|
||||
|
||||
## Rule Details
|
||||
|
||||
The `dependencies` field in `package.json` declares packages that are installed alongside the node at runtime. In the context of n8n community nodes this is dangerous:
|
||||
|
||||
- Community nodes run inside the shared n8n runtime alongside all other installed nodes. Any package listed in `dependencies` gets installed into that shared environment and can shadow or conflict with versions already used by n8n or other nodes.
|
||||
- Unlike application packages, community nodes should not own their runtime environment. Shared libraries must be declared in `peerDependencies` (so the host runtime supplies them) or bundled at build time into the published artifact.
|
||||
- A non-empty `dependencies` section is a strong signal that the package was scaffolded from a generic Node.js template without adapting it to the n8n community node model.
|
||||
|
||||
## Examples
|
||||
|
||||
### Incorrect
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "n8n-nodes-example",
|
||||
"dependencies": {
|
||||
"axios": "1.0.0"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "n8n-nodes-example",
|
||||
"dependencies": {
|
||||
"axios": "1.7.0",
|
||||
"fast-xml-parser": "4.4.0",
|
||||
"minimatch": "9.0.5"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Correct
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "n8n-nodes-example",
|
||||
"peerDependencies": {
|
||||
"n8n-workflow": "*"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "n8n-nodes-example",
|
||||
"dependencies": {},
|
||||
"peerDependencies": {
|
||||
"n8n-workflow": "*"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
# Require { itemIndex } in NodeOperationError / NodeApiError options inside item loops (`@n8n/community-nodes/node-operation-error-itemindex`)
|
||||
|
||||
💼 This rule is enabled in the following configs: ✅ `recommended`, ☑️ `recommendedWithoutN8nCloudSupport`.
|
||||
|
||||
<!-- end auto-generated rule header -->
|
||||
|
||||
## Rule Details
|
||||
|
||||
When throwing `NodeOperationError` or `NodeApiError` inside the item-processing loop of an `execute()` method, the options object (third argument) must contain an `itemIndex` property. Without it, n8n cannot associate the error with the specific item that caused it, which breaks per-item error reporting and `continueOnFail` behaviour.
|
||||
|
||||
The rule only fires inside **item loops** — `for` or `for...of` statements that iterate over the result of `this.getInputData()`. Errors thrown outside such loops (e.g. in webhook handlers, trigger setup, or credential testing helpers) are not flagged.
|
||||
|
||||
## Examples
|
||||
|
||||
### ❌ Incorrect
|
||||
|
||||
```typescript
|
||||
export class MyNode implements INodeType {
|
||||
description: INodeTypeDescription = { /* ... */ };
|
||||
|
||||
async execute(this: IExecuteFunctions): Promise<INodeExecutionData[][]> {
|
||||
const items = this.getInputData();
|
||||
const returnData: INodeExecutionData[] = [];
|
||||
|
||||
for (let i = 0; i < items.length; i++) {
|
||||
try {
|
||||
// ...
|
||||
} catch (error) {
|
||||
// Missing { itemIndex } — n8n cannot map this error back to item i
|
||||
throw new NodeOperationError(this.getNode(), error);
|
||||
}
|
||||
}
|
||||
|
||||
return [returnData];
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```typescript
|
||||
export class MyNode implements INodeType {
|
||||
description: INodeTypeDescription = { /* ... */ };
|
||||
|
||||
async execute(this: IExecuteFunctions): Promise<INodeExecutionData[][]> {
|
||||
const items = this.getInputData();
|
||||
const returnData: INodeExecutionData[] = [];
|
||||
|
||||
for (let i = 0; i < items.length; i++) {
|
||||
try {
|
||||
// ...
|
||||
} catch (error) {
|
||||
throw new NodeOperationError(this.getNode(), error, { itemIndex: i });
|
||||
}
|
||||
}
|
||||
|
||||
return [returnData];
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Using `for...of` with a named loop variable:
|
||||
|
||||
```typescript
|
||||
async execute(this: IExecuteFunctions): Promise<INodeExecutionData[][]> {
|
||||
const items = this.getInputData();
|
||||
const returnData: INodeExecutionData[] = [];
|
||||
let itemIndex = 0;
|
||||
|
||||
for (const item of items) {
|
||||
try {
|
||||
// ...
|
||||
} catch (error) {
|
||||
throw new NodeApiError(this.getNode(), error, { itemIndex });
|
||||
}
|
||||
itemIndex++;
|
||||
}
|
||||
|
||||
return [returnData];
|
||||
}
|
||||
```
|
||||
|
|
@ -0,0 +1,78 @@
|
|||
# Ensure credentials referenced in node descriptions exist as credential classes in the package (`@n8n/community-nodes/valid-credential-references`)
|
||||
|
||||
💼 This rule is enabled in the following configs: ✅ `recommended`, ☑️ `recommendedWithoutN8nCloudSupport`.
|
||||
|
||||
💡 This rule is manually fixable by [editor suggestions](https://eslint.org/docs/latest/use/core-concepts#rule-suggestions).
|
||||
|
||||
<!-- end auto-generated rule header -->
|
||||
|
||||
## Rule Details
|
||||
|
||||
For each entry in `description.credentials[]`, this rule verifies that the referenced `name` matches the `name` class field of a credential class declared in the same package (as listed in `package.json` under `n8n.credentials`).
|
||||
|
||||
This catches typos and broken references. When `cred-class-name-suffix` is also enabled, this rule naturally enforces the naming convention in the common case while still allowing legitimately named credentials such as `httpHeaderAuth` or `webhookAuth`.
|
||||
|
||||
## Examples
|
||||
|
||||
### ❌ Incorrect
|
||||
|
||||
```typescript
|
||||
// MyApiCredential.credentials.ts
|
||||
export class MyApiCredential implements ICredentialType {
|
||||
name = 'myApiCredential';
|
||||
// ...
|
||||
}
|
||||
|
||||
// package.json: "n8n": { "credentials": ["dist/credentials/MyApiCredential.credentials.js"] }
|
||||
|
||||
export class MyNode implements INodeType {
|
||||
description: INodeTypeDescription = {
|
||||
credentials: [
|
||||
{
|
||||
name: 'myApiCredentail', // Typo — no credential with this name exists
|
||||
required: true,
|
||||
},
|
||||
],
|
||||
// ...
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
### ✅ Correct
|
||||
|
||||
```typescript
|
||||
// MyApiCredential.credentials.ts
|
||||
export class MyApiCredential implements ICredentialType {
|
||||
name = 'myApiCredential';
|
||||
// ...
|
||||
}
|
||||
|
||||
// package.json: "n8n": { "credentials": ["dist/credentials/MyApiCredential.credentials.js"] }
|
||||
|
||||
export class MyNode implements INodeType {
|
||||
description: INodeTypeDescription = {
|
||||
credentials: [
|
||||
{
|
||||
name: 'myApiCredential', // Matches the credential class name property
|
||||
required: true,
|
||||
},
|
||||
],
|
||||
// ...
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
## Setup
|
||||
|
||||
Declare your credential files in `package.json` so the rule can resolve credential class names:
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "n8n-nodes-my-service",
|
||||
"n8n": {
|
||||
"credentials": [
|
||||
"dist/credentials/MyApiCredential.credentials.js"
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"name": "@n8n/eslint-plugin-community-nodes",
|
||||
"type": "module",
|
||||
"version": "0.14.0",
|
||||
"version": "0.15.0",
|
||||
"main": "./dist/plugin.js",
|
||||
"types": "./dist/plugin.d.ts",
|
||||
"exports": {
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ const configs = {
|
|||
'@n8n/community-nodes/no-forbidden-lifecycle-scripts': 'error',
|
||||
'@n8n/community-nodes/no-http-request-with-manual-auth': 'error',
|
||||
'@n8n/community-nodes/no-overrides-field': 'error',
|
||||
'@n8n/community-nodes/no-runtime-dependencies': 'error',
|
||||
'@n8n/community-nodes/icon-validation': 'error',
|
||||
'@n8n/community-nodes/options-sorted-alphabetically': 'warn',
|
||||
'@n8n/community-nodes/resource-operation-pattern': 'warn',
|
||||
|
|
@ -39,10 +40,12 @@ const configs = {
|
|||
'@n8n/community-nodes/cred-class-field-icon-missing': 'error',
|
||||
'@n8n/community-nodes/node-connection-type-literal': 'error',
|
||||
'@n8n/community-nodes/missing-paired-item': 'error',
|
||||
'@n8n/community-nodes/node-operation-error-itemindex': 'error',
|
||||
'@n8n/community-nodes/require-community-node-keyword': 'warn',
|
||||
'@n8n/community-nodes/require-continue-on-fail': 'error',
|
||||
'@n8n/community-nodes/require-node-api-error': 'error',
|
||||
'@n8n/community-nodes/require-node-description-fields': 'error',
|
||||
'@n8n/community-nodes/valid-credential-references': 'error',
|
||||
'@n8n/community-nodes/valid-peer-dependencies': 'error',
|
||||
'@n8n/community-nodes/webhook-lifecycle-complete': 'error',
|
||||
},
|
||||
|
|
@ -63,6 +66,7 @@ const configs = {
|
|||
'@n8n/community-nodes/no-forbidden-lifecycle-scripts': 'error',
|
||||
'@n8n/community-nodes/no-http-request-with-manual-auth': 'error',
|
||||
'@n8n/community-nodes/no-overrides-field': 'error',
|
||||
'@n8n/community-nodes/no-runtime-dependencies': 'error',
|
||||
'@n8n/community-nodes/icon-validation': 'error',
|
||||
'@n8n/community-nodes/options-sorted-alphabetically': 'warn',
|
||||
'@n8n/community-nodes/credential-documentation-url': 'error',
|
||||
|
|
@ -70,10 +74,12 @@ const configs = {
|
|||
'@n8n/community-nodes/cred-class-field-icon-missing': 'error',
|
||||
'@n8n/community-nodes/node-connection-type-literal': 'error',
|
||||
'@n8n/community-nodes/missing-paired-item': 'error',
|
||||
'@n8n/community-nodes/node-operation-error-itemindex': 'error',
|
||||
'@n8n/community-nodes/require-community-node-keyword': 'warn',
|
||||
'@n8n/community-nodes/require-continue-on-fail': 'error',
|
||||
'@n8n/community-nodes/require-node-api-error': 'error',
|
||||
'@n8n/community-nodes/require-node-description-fields': 'error',
|
||||
'@n8n/community-nodes/valid-credential-references': 'error',
|
||||
'@n8n/community-nodes/valid-peer-dependencies': 'error',
|
||||
'@n8n/community-nodes/webhook-lifecycle-complete': 'error',
|
||||
},
|
||||
|
|
|
|||
|
|
@ -14,8 +14,10 @@ import { NoHttpRequestWithManualAuthRule } from './no-http-request-with-manual-a
|
|||
import { NoOverridesFieldRule } from './no-overrides-field.js';
|
||||
import { NoRestrictedGlobalsRule } from './no-restricted-globals.js';
|
||||
import { NoRestrictedImportsRule } from './no-restricted-imports.js';
|
||||
import { NoRuntimeDependenciesRule } from './no-runtime-dependencies.js';
|
||||
import { NodeClassDescriptionIconMissingRule } from './node-class-description-icon-missing.js';
|
||||
import { NodeConnectionTypeLiteralRule } from './node-connection-type-literal.js';
|
||||
import { NodeOperationErrorItemIndexRule } from './node-operation-error-itemindex.js';
|
||||
import { NodeUsableAsToolRule } from './node-usable-as-tool.js';
|
||||
import { OptionsSortedAlphabeticallyRule } from './options-sorted-alphabetically.js';
|
||||
import { PackageNameConventionRule } from './package-name-convention.js';
|
||||
|
|
@ -24,6 +26,7 @@ import { RequireContinueOnFailRule } from './require-continue-on-fail.js';
|
|||
import { RequireNodeApiErrorRule } from './require-node-api-error.js';
|
||||
import { RequireNodeDescriptionFieldsRule } from './require-node-description-fields.js';
|
||||
import { ResourceOperationPatternRule } from './resource-operation-pattern.js';
|
||||
import { ValidCredentialReferencesRule } from './valid-credential-references.js';
|
||||
import { ValidPeerDependenciesRule } from './valid-peer-dependencies.js';
|
||||
import { WebhookLifecycleCompleteRule } from './webhook-lifecycle-complete.js';
|
||||
|
||||
|
|
@ -41,17 +44,20 @@ export const rules = {
|
|||
'no-forbidden-lifecycle-scripts': NoForbiddenLifecycleScriptsRule,
|
||||
'no-http-request-with-manual-auth': NoHttpRequestWithManualAuthRule,
|
||||
'no-overrides-field': NoOverridesFieldRule,
|
||||
'no-runtime-dependencies': NoRuntimeDependenciesRule,
|
||||
'icon-validation': IconValidationRule,
|
||||
'resource-operation-pattern': ResourceOperationPatternRule,
|
||||
'credential-documentation-url': CredentialDocumentationUrlRule,
|
||||
'node-class-description-icon-missing': NodeClassDescriptionIconMissingRule,
|
||||
'cred-class-field-icon-missing': CredClassFieldIconMissingRule,
|
||||
'node-connection-type-literal': NodeConnectionTypeLiteralRule,
|
||||
'node-operation-error-itemindex': NodeOperationErrorItemIndexRule,
|
||||
'missing-paired-item': MissingPairedItemRule,
|
||||
'require-community-node-keyword': RequireCommunityNodeKeywordRule,
|
||||
'require-continue-on-fail': RequireContinueOnFailRule,
|
||||
'require-node-api-error': RequireNodeApiErrorRule,
|
||||
'require-node-description-fields': RequireNodeDescriptionFieldsRule,
|
||||
'valid-credential-references': ValidCredentialReferencesRule,
|
||||
'valid-peer-dependencies': ValidPeerDependenciesRule,
|
||||
'webhook-lifecycle-complete': WebhookLifecycleCompleteRule,
|
||||
} satisfies Record<string, AnyRuleModule>;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,50 @@
|
|||
import { RuleTester } from '@typescript-eslint/rule-tester';
|
||||
|
||||
import { NoRuntimeDependenciesRule } from './no-runtime-dependencies.js';
|
||||
|
||||
const ruleTester = new RuleTester();

// The rule must fire only when a file named package.json has a non-empty
// top-level "dependencies" object; every other shape must stay clean.
ruleTester.run('no-runtime-dependencies', NoRuntimeDependenciesRule, {
	valid: [
		{
			name: 'no dependencies field',
			filename: 'package.json',
			code: '{ "name": "n8n-nodes-example", "version": "1.0.0" }',
		},
		{
			name: 'empty dependencies object is allowed',
			filename: 'package.json',
			code: '{ "name": "n8n-nodes-example", "dependencies": {} }',
		},
		{
			name: 'non-package.json file is ignored',
			filename: 'some-config.json',
			code: '{ "dependencies": { "axios": "1.0.0" } }',
		},
		{
			// Only the root object's "dependencies" key counts.
			name: 'nested "dependencies" key inside another field is allowed',
			filename: 'package.json',
			code: '{ "name": "n8n-nodes-example", "config": { "dependencies": { "axios": "1.0.0" } } }',
		},
	],
	invalid: [
		{
			name: 'single runtime dependency is forbidden',
			filename: 'package.json',
			code: '{ "name": "n8n-nodes-example", "dependencies": { "axios": "1.0.0" } }',
			errors: [{ messageId: 'runtimeDependenciesForbidden' }],
		},
		{
			name: 'multiple runtime dependencies are forbidden',
			filename: 'package.json',
			code: '{ "name": "n8n-nodes-example", "dependencies": { "axios": "1.0.0", "lodash": "^4.0.0" } }',
			errors: [{ messageId: 'runtimeDependenciesForbidden' }],
		},
		{
			name: 'real-world package with bundled deps is forbidden',
			filename: 'package.json',
			code: '{ "name": "n8n-nodes-sinch", "dependencies": { "axios": "1.7.0", "fast-xml-parser": "4.4.0", "minimatch": "9.0.5" } }',
			errors: [{ messageId: 'runtimeDependenciesForbidden' }],
		},
	],
});
|
||||
|
|
@ -0,0 +1,50 @@
|
|||
import type { TSESTree } from '@typescript-eslint/utils';
|
||||
import { AST_NODE_TYPES } from '@typescript-eslint/utils';
|
||||
|
||||
import { createRule, findJsonProperty } from '../utils/index.js';
|
||||
|
||||
export const NoRuntimeDependenciesRule = createRule({
|
||||
name: 'no-runtime-dependencies',
|
||||
meta: {
|
||||
type: 'problem',
|
||||
docs: {
|
||||
description: 'Disallow non-empty "dependencies" in community node package.json',
|
||||
},
|
||||
messages: {
|
||||
runtimeDependenciesForbidden:
|
||||
'The "dependencies" field must be empty or absent in community node packages. Runtime dependencies get bundled into the n8n instance and can conflict with other nodes or the n8n runtime itself. Move shared libraries to "peerDependencies" or bundle them into your build artifact.',
|
||||
},
|
||||
schema: [],
|
||||
},
|
||||
defaultOptions: [],
|
||||
create(context) {
|
||||
if (!context.filename.endsWith('package.json')) {
|
||||
return {};
|
||||
}
|
||||
|
||||
return {
|
||||
ObjectExpression(node: TSESTree.ObjectExpression) {
|
||||
if (node.parent?.type !== AST_NODE_TYPES.ExpressionStatement) {
|
||||
return;
|
||||
}
|
||||
|
||||
const depsProp = findJsonProperty(node, 'dependencies');
|
||||
if (!depsProp) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (
|
||||
depsProp.value.type !== AST_NODE_TYPES.ObjectExpression ||
|
||||
depsProp.value.properties.length === 0
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
context.report({
|
||||
node: depsProp,
|
||||
messageId: 'runtimeDependenciesForbidden',
|
||||
});
|
||||
},
|
||||
};
|
||||
},
|
||||
});
|
||||
|
|
@ -0,0 +1,280 @@
|
|||
import { RuleTester } from '@typescript-eslint/rule-tester';
|
||||
|
||||
import { NodeOperationErrorItemIndexRule } from './node-operation-error-itemindex.js';
|
||||
|
||||
const ruleTester = new RuleTester();
|
||||
|
||||
// The rule only runs on *.node.ts files, so every generated test case uses
// a matching filename.
const NODE_FILENAME = 'TestNode.node.ts';

/**
 * Wraps `executeBody` in a minimal INodeType class so each test case only
 * has to specify the body of execute().
 */
function createNodeWithExecute(executeBody: string): { filename: string; code: string } {
	return {
		filename: NODE_FILENAME,
		code: `
import type { INodeType, INodeTypeDescription, IExecuteFunctions, INodeExecutionData } from 'n8n-workflow';
import { NodeOperationError, NodeApiError } from 'n8n-workflow';

export class TestNode implements INodeType {
	description: INodeTypeDescription = {
		displayName: 'Test Node',
		name: 'testNode',
		group: ['input'],
		version: 1,
		description: 'A test node',
		defaults: { name: 'Test Node' },
		inputs: ['main'],
		outputs: ['main'],
		properties: [],
	};

	async execute(this: IExecuteFunctions): Promise<INodeExecutionData[][]> {
		${executeBody}
	}
}`,
	};
}
|
||||
|
||||
// Valid cases cover: non-node classes, errors outside loops, non-item loops,
// every loop shape that does carry itemIndex, statically-unverifiable option
// arguments, and methods other than execute().
// Invalid cases cover every loop shape where itemIndex is missing.
ruleTester.run('node-operation-error-itemindex', NodeOperationErrorItemIndexRule, {
	valid: [
		{
			name: 'non-node class is ignored',
			filename: NODE_FILENAME,
			code: `
export class RegularClass {
	async execute() {
		const items = this.getInputData();
		for (let i = 0; i < items.length; i++) {
			throw new NodeOperationError(this.getNode(), 'error');
		}
	}
}`,
		},
		{
			name: 'NodeOperationError outside any loop is allowed',
			...createNodeWithExecute(`
				throw new NodeOperationError(this.getNode(), 'some error');
			`),
		},
		{
			// The loop condition references settings.length, not an items variable.
			name: 'NodeOperationError in a non-item loop is allowed',
			...createNodeWithExecute(`
				const settings = ['a', 'b', 'c'];
				for (let i = 0; i < settings.length; i++) {
					throw new NodeOperationError(this.getNode(), 'error');
				}
			`),
		},
		{
			name: 'NodeOperationError with itemIndex in C-style for loop',
			...createNodeWithExecute(`
				const items = this.getInputData();
				for (let i = 0; i < items.length; i++) {
					throw new NodeOperationError(this.getNode(), 'error', { itemIndex: i });
				}
			`),
		},
		{
			name: 'NodeOperationError with itemIndex shorthand in C-style for loop',
			...createNodeWithExecute(`
				const items = this.getInputData();
				for (let itemIndex = 0; itemIndex < items.length; itemIndex++) {
					throw new NodeOperationError(this.getNode(), 'error', { itemIndex });
				}
			`),
		},
		{
			name: 'NodeApiError with itemIndex in C-style for loop',
			...createNodeWithExecute(`
				const items = this.getInputData();
				for (let i = 0; i < items.length; i++) {
					throw new NodeApiError(this.getNode(), error, { itemIndex: i });
				}
			`),
		},
		{
			name: 'NodeOperationError with itemIndex in for...of loop',
			...createNodeWithExecute(`
				const items = this.getInputData();
				for (const [i, item] of items.entries()) {
					throw new NodeOperationError(this.getNode(), 'error', { itemIndex: i });
				}
			`),
		},
		{
			name: 'NodeOperationError with itemIndex in for...of directly over getInputData()',
			...createNodeWithExecute(`
				let i = 0;
				for (const item of this.getInputData()) {
					throw new NodeOperationError(this.getNode(), 'error', { itemIndex: i++ });
				}
			`),
		},
		{
			// A bare variable as the options argument cannot be inspected statically.
			name: 'NodeOperationError with variable as 3rd arg (cannot statically verify — skip)',
			...createNodeWithExecute(`
				const items = this.getInputData();
				for (let i = 0; i < items.length; i++) {
					const opts = { itemIndex: i };
					throw new NodeOperationError(this.getNode(), 'error', opts);
				}
			`),
		},
		{
			name: 'NodeOperationError with spread plus explicit itemIndex in options',
			...createNodeWithExecute(`
				const items = this.getInputData();
				for (let i = 0; i < items.length; i++) {
					throw new NodeOperationError(this.getNode(), 'error', { ...opts, itemIndex: i });
				}
			`),
		},
		{
			// The rule only inspects execute(); webhook() and other methods are exempt.
			name: 'NodeOperationError outside execute() method is not flagged',
			filename: NODE_FILENAME,
			code: `
import type { INodeType, INodeTypeDescription, IWebhookFunctions, IWebhookResponseData } from 'n8n-workflow';
import { NodeOperationError } from 'n8n-workflow';

export class TestNode implements INodeType {
	description: INodeTypeDescription = {
		displayName: 'Test Node',
		name: 'testNode',
		group: ['trigger'],
		version: 1,
		description: 'A test node',
		defaults: { name: 'Test Node' },
		inputs: [],
		outputs: ['main'],
		webhooks: [{ name: 'default', httpMethod: 'POST', responseMode: 'onReceived', path: 'webhook' }],
		properties: [],
	};

	async webhook(this: IWebhookFunctions): Promise<IWebhookResponseData> {
		const items = this.getInputData();
		for (let i = 0; i < items.length; i++) {
			throw new NodeOperationError(this.getNode(), 'webhook error');
		}
		return { workflowData: [[]] };
	}
}`,
		},
		{
			// The error carries itemIndex from the outer item loop; the inner loop
			// over a non-item array does not change that.
			name: 'NodeOperationError in nested non-item for loop inside item loop is allowed',
			...createNodeWithExecute(`
				const items = this.getInputData();
				for (let i = 0; i < items.length; i++) {
					const options = ['a', 'b'];
					for (let j = 0; j < options.length; j++) {
						throw new NodeOperationError(this.getNode(), 'error', { itemIndex: i });
					}
				}
			`),
		},
	],
	invalid: [
		{
			name: 'NodeOperationError without any options in C-style for loop',
			...createNodeWithExecute(`
				const items = this.getInputData();
				for (let i = 0; i < items.length; i++) {
					throw new NodeOperationError(this.getNode(), 'error');
				}
			`),
			errors: [{ messageId: 'missingItemIndex', data: { errorClass: 'NodeOperationError' } }],
		},
		{
			name: 'NodeOperationError with empty options object in C-style for loop',
			...createNodeWithExecute(`
				const items = this.getInputData();
				for (let i = 0; i < items.length; i++) {
					throw new NodeOperationError(this.getNode(), 'error', {});
				}
			`),
			errors: [{ messageId: 'missingItemIndex', data: { errorClass: 'NodeOperationError' } }],
		},
		{
			name: 'NodeOperationError with options but missing itemIndex in C-style for loop',
			...createNodeWithExecute(`
				const items = this.getInputData();
				for (let i = 0; i < items.length; i++) {
					throw new NodeOperationError(this.getNode(), 'error', { description: 'something' });
				}
			`),
			errors: [{ messageId: 'missingItemIndex', data: { errorClass: 'NodeOperationError' } }],
		},
		{
			name: 'NodeApiError without itemIndex in C-style for loop',
			...createNodeWithExecute(`
				const items = this.getInputData();
				for (let i = 0; i < items.length; i++) {
					throw new NodeApiError(this.getNode(), error);
				}
			`),
			errors: [{ messageId: 'missingItemIndex', data: { errorClass: 'NodeApiError' } }],
		},
		{
			name: 'NodeOperationError without itemIndex in for...of over items variable',
			...createNodeWithExecute(`
				const items = this.getInputData();
				for (const item of items) {
					throw new NodeOperationError(this.getNode(), 'error');
				}
			`),
			errors: [{ messageId: 'missingItemIndex', data: { errorClass: 'NodeOperationError' } }],
		},
		{
			name: 'NodeOperationError without itemIndex in for...of directly over getInputData()',
			...createNodeWithExecute(`
				for (const item of this.getInputData()) {
					throw new NodeOperationError(this.getNode(), 'error');
				}
			`),
			errors: [{ messageId: 'missingItemIndex', data: { errorClass: 'NodeOperationError' } }],
		},
		{
			// Both throw sites in the same loop are reported independently.
			name: 'multiple errors in the same item loop',
			...createNodeWithExecute(`
				const items = this.getInputData();
				for (let i = 0; i < items.length; i++) {
					if (someCondition) {
						throw new NodeOperationError(this.getNode(), 'error A');
					}
					throw new NodeApiError(this.getNode(), error);
				}
			`),
			errors: [
				{ messageId: 'missingItemIndex', data: { errorClass: 'NodeOperationError' } },
				{ messageId: 'missingItemIndex', data: { errorClass: 'NodeApiError' } },
			],
		},
		{
			// Naming the loop counter `itemIndex` is not enough — it must be passed
			// in the options object.
			name: 'NodeOperationError without itemIndex when loop variable is named itemIndex',
			...createNodeWithExecute(`
				const items = this.getInputData();
				for (let itemIndex = 0; itemIndex < items.length; itemIndex++) {
					throw new NodeOperationError(this.getNode(), 'error', { description: 'oops' });
				}
			`),
			errors: [{ messageId: 'missingItemIndex', data: { errorClass: 'NodeOperationError' } }],
		},
		{
			name: 'NodeOperationError with spread-only options (spread does not guarantee itemIndex)',
			...createNodeWithExecute(`
				const items = this.getInputData();
				for (let i = 0; i < items.length; i++) {
					throw new NodeOperationError(this.getNode(), 'error', { ...opts });
				}
			`),
			errors: [{ messageId: 'missingItemIndex', data: { errorClass: 'NodeOperationError' } }],
		},
		{
			name: 'NodeOperationError without itemIndex with non-standard items variable name',
			...createNodeWithExecute(`
				const inputItems = this.getInputData();
				for (let i = 0; i < inputItems.length; i++) {
					throw new NodeOperationError(this.getNode(), 'error');
				}
			`),
			errors: [{ messageId: 'missingItemIndex', data: { errorClass: 'NodeOperationError' } }],
		},
	],
});
|
||||
|
|
@ -0,0 +1,223 @@
|
|||
/**
|
||||
* Flags `new NodeOperationError(...)` or `new NodeApiError(...)` inside item
|
||||
* loops in `execute()` methods that omit `{ itemIndex }` from the options
|
||||
* argument. Without it, n8n cannot associate the error with the specific item
|
||||
* that caused it, breaking per-item error reporting and `continueOnFail`.
|
||||
*
|
||||
* "Item loop" means a `for` or `for...of` that iterates over the result of
|
||||
* `this.getInputData()` (or a variable initialised from it). Errors outside
|
||||
* such loops — e.g. in webhook handlers or trigger setup — are not flagged.
|
||||
*/
|
||||
|
||||
import { AST_NODE_TYPES, type TSESTree } from '@typescript-eslint/utils';
|
||||
|
||||
import { createRule, findObjectProperty, isFileType, isNodeTypeClass } from '../utils/index.js';
|
||||
|
||||
const ITEM_ERROR_CLASSES = new Set(['NodeOperationError', 'NodeApiError']);
|
||||
|
||||
/** Returns true when `node` is a bare `this.getInputData(...)` call. */
|
||||
function isGetInputDataCall(node: TSESTree.CallExpression): boolean {
|
||||
return (
|
||||
node.callee.type === AST_NODE_TYPES.MemberExpression &&
|
||||
node.callee.object.type === AST_NODE_TYPES.ThisExpression &&
|
||||
node.callee.property.type === AST_NODE_TYPES.Identifier &&
|
||||
node.callee.property.name === 'getInputData'
|
||||
);
|
||||
}
|
||||
|
||||
/** Returns true when `node` is `<varName>.length` for any name in `varNames`. */
|
||||
function isLengthAccessOnVariable(node: TSESTree.Node, varNames: Set<string>): boolean {
|
||||
return (
|
||||
node.type === AST_NODE_TYPES.MemberExpression &&
|
||||
!node.computed &&
|
||||
node.property.type === AST_NODE_TYPES.Identifier &&
|
||||
node.property.name === 'length' &&
|
||||
node.object.type === AST_NODE_TYPES.Identifier &&
|
||||
varNames.has(node.object.name)
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true when the `for` test condition references `<itemVar>.length`,
|
||||
* indicating that the loop iterates over an items array.
|
||||
*/
|
||||
function isItemForLoop(node: TSESTree.ForStatement, itemVarNames: Set<string>): boolean {
|
||||
if (!node.test || node.test.type !== AST_NODE_TYPES.BinaryExpression) return false;
|
||||
|
||||
const { left, right } = node.test;
|
||||
return (
|
||||
isLengthAccessOnVariable(left, itemVarNames) || isLengthAccessOnVariable(right, itemVarNames)
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true when the `for...of` iterable is an items variable or a direct
|
||||
* `this.getInputData()` call.
|
||||
*/
|
||||
function isItemForOfLoop(node: TSESTree.ForOfStatement, itemVarNames: Set<string>): boolean {
|
||||
const { right } = node;
|
||||
|
||||
if (right.type === AST_NODE_TYPES.Identifier && itemVarNames.has(right.name)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return right.type === AST_NODE_TYPES.CallExpression && isGetInputDataCall(right);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true when the `NodeOperationError` / `NodeApiError` constructor call
|
||||
* already has an `{ itemIndex }` property in its options argument, or when the
|
||||
* options argument cannot be statically inspected (variable / spread) — in
|
||||
* which case we give the benefit of the doubt.
|
||||
*/
|
||||
function hasItemIndexOption(node: TSESTree.NewExpression): boolean {
|
||||
const { arguments: args } = node;
|
||||
|
||||
if (args.length < 3) return false;
|
||||
|
||||
const optionsArg = args[2];
|
||||
|
||||
// Non-object-literal (bare variable reference) — can't statically check, assume OK.
|
||||
if (!optionsArg || optionsArg.type !== AST_NODE_TYPES.ObjectExpression) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// itemIndex must be an explicit own property of the options object.
|
||||
// Spread elements (e.g. { ...opts }) are not sufficient — they may not
|
||||
// include itemIndex and would silently bypass this requirement.
|
||||
return findObjectProperty(optionsArg, 'itemIndex') !== null;
|
||||
}
|
||||
|
||||
export const NodeOperationErrorItemIndexRule = createRule({
	name: 'node-operation-error-itemindex',
	meta: {
		type: 'problem',
		docs: {
			description:
				'Require { itemIndex } in NodeOperationError / NodeApiError options inside item loops',
		},
		messages: {
			missingItemIndex:
				'`new {{ errorClass }}(...)` inside an item loop must include `{ itemIndex }` as the ' +
				'third argument so n8n can associate the error with the failing item.',
		},
		schema: [],
	},
	defaultOptions: [],
	create(context) {
		// Only node implementation files can contain execute() item loops.
		if (!isFileType(context.filename, '.node.ts')) {
			return {};
		}

		// Listener-local state machine: errors only matter when we are inside
		// the execute() method of an INodeType class.
		let inNodeTypeClass = false;
		let inExecuteMethod = false;

		/** Names of variables initialised from `this.getInputData()` in the current execute() scope. */
		const itemVariableNames = new Set<string>();

		/** AST nodes for loops that are confirmed item loops. */
		const itemLoopNodes = new Set<TSESTree.ForStatement | TSESTree.ForOfStatement>();

		/** Number of currently open item loops (supports nested loops). */
		let itemLoopDepth = 0;

		// Clears all per-execute() state when leaving the method or the class.
		function resetExecuteState() {
			inExecuteMethod = false;
			itemVariableNames.clear();
			itemLoopNodes.clear();
			itemLoopDepth = 0;
		}

		return {
			ClassDeclaration(node) {
				if (isNodeTypeClass(node)) {
					inNodeTypeClass = true;
				}
			},

			'ClassDeclaration:exit'() {
				// NOTE(review): resets unconditionally, so a class declaration nested
				// inside a node class would clear the flag early — confirm nested
				// class declarations are not a supported layout here.
				inNodeTypeClass = false;
				resetExecuteState();
			},

			MethodDefinition(node: TSESTree.MethodDefinition) {
				if (
					inNodeTypeClass &&
					node.key.type === AST_NODE_TYPES.Identifier &&
					node.key.name === 'execute'
				) {
					inExecuteMethod = true;
				}
			},

			'MethodDefinition:exit'(node: TSESTree.MethodDefinition) {
				if (
					inExecuteMethod &&
					node.key.type === AST_NODE_TYPES.Identifier &&
					node.key.name === 'execute'
				) {
					resetExecuteState();
				}
			},

			// Track `const items = this.getInputData()` so later loops over those
			// variables can be recognised as item loops.
			VariableDeclarator(node: TSESTree.VariableDeclarator) {
				if (!inExecuteMethod) return;
				if (!node.init) return;
				if (node.id.type !== AST_NODE_TYPES.Identifier) return;

				if (node.init.type === AST_NODE_TYPES.CallExpression && isGetInputDataCall(node.init)) {
					itemVariableNames.add(node.id.name);
				}
			},

			ForStatement(node: TSESTree.ForStatement) {
				if (!inExecuteMethod) return;
				if (isItemForLoop(node, itemVariableNames)) {
					itemLoopNodes.add(node);
					itemLoopDepth++;
				}
			},

			'ForStatement:exit'(node: TSESTree.ForStatement) {
				// Only decrement for loops that were counted on entry.
				if (itemLoopNodes.has(node)) {
					itemLoopNodes.delete(node);
					itemLoopDepth--;
				}
			},

			ForOfStatement(node: TSESTree.ForOfStatement) {
				if (!inExecuteMethod) return;
				if (isItemForOfLoop(node, itemVariableNames)) {
					itemLoopNodes.add(node);
					itemLoopDepth++;
				}
			},

			'ForOfStatement:exit'(node: TSESTree.ForOfStatement) {
				if (itemLoopNodes.has(node)) {
					itemLoopNodes.delete(node);
					itemLoopDepth--;
				}
			},

			NewExpression(node: TSESTree.NewExpression) {
				// Only constructor calls lexically inside at least one item loop matter.
				if (itemLoopDepth === 0) return;

				if (
					node.callee.type !== AST_NODE_TYPES.Identifier ||
					!ITEM_ERROR_CLASSES.has(node.callee.name)
				) {
					return;
				}

				if (!hasItemIndexOption(node)) {
					context.report({
						node,
						messageId: 'missingItemIndex',
						data: { errorClass: node.callee.name },
					});
				}
			},
		};
	},
});
|
||||
|
|
@ -0,0 +1,230 @@
|
|||
import { RuleTester } from '@typescript-eslint/rule-tester';
|
||||
import { afterEach, beforeEach, describe, vi } from 'vitest';
|
||||
|
||||
import { ValidCredentialReferencesRule } from './valid-credential-references.js';
|
||||
import * as fileUtils from '../utils/file-utils.js';
|
||||
|
||||
// Replace only the filesystem-touching helpers with mocks; everything else in
// file-utils keeps its real implementation.
vi.mock('../utils/file-utils.js', async () => {
	const actual = await vi.importActual('../utils/file-utils.js');
	return {
		...actual,
		readPackageJsonCredentials: vi.fn(),
		findPackageJson: vi.fn(),
	};
});

const mockReadPackageJsonCredentials = vi.mocked(fileUtils.readPackageJsonCredentials);
const mockFindPackageJson = vi.mocked(fileUtils.findPackageJson);

const ruleTester = new RuleTester();

// The rule only runs on *.node.ts files.
const nodeFilePath = '/tmp/TestNode.node.ts';
|
||||
|
||||
/**
 * Builds the source of a minimal INodeType class whose description declares
 * the given credentials (string shorthand or `{ name, required }` object).
 * Credential names are emitted with single quotes; compare with
 * createExpectedNodeCode, which emits the double quotes the fixer produces.
 */
function createNodeCode(
	credentials: Array<string | { name: string; required?: boolean }> = [],
): string {
	const credentialsArray =
		credentials.length > 0
			? credentials
					.map((cred) => {
						if (typeof cred === 'string') {
							return `'${cred}'`;
						} else {
							const required =
								cred.required !== undefined ? `,\n\t\t\t\trequired: ${cred.required}` : '';
							return `{\n\t\t\t\tname: '${cred.name}'${required},\n\t\t\t}`;
						}
					})
					.join(',\n\t\t\t')
			: '';

	const credentialsProperty =
		credentials.length > 0 ? `credentials: [\n\t\t\t${credentialsArray}\n\t\t],` : '';

	return `
import type { INodeType, INodeTypeDescription } from 'n8n-workflow';

export class TestNode implements INodeType {
	description: INodeTypeDescription = {
		displayName: 'Test Node',
		name: 'testNode',
		group: ['output'],
		version: 1,
		inputs: ['main'],
		outputs: ['main'],
		${credentialsProperty}
		properties: [],
	};
}`;
}
|
||||
|
||||
/** Same as createNodeCode but uses double quotes for the credential name — matches fixer output */
function createExpectedNodeCode(
	credentials: Array<string | { name: string; required?: boolean }> = [],
): string {
	const credentialsArray =
		credentials.length > 0
			? credentials
					.map((cred) => {
						if (typeof cred === 'string') {
							return `"${cred}"`;
						} else {
							const required =
								cred.required !== undefined ? `,\n\t\t\t\trequired: ${cred.required}` : '';
							return `{\n\t\t\t\tname: "${cred.name}"${required},\n\t\t\t}`;
						}
					})
					.join(',\n\t\t\t')
			: '';

	const credentialsProperty =
		credentials.length > 0 ? `credentials: [\n\t\t\t${credentialsArray}\n\t\t],` : '';

	return `
import type { INodeType, INodeTypeDescription } from 'n8n-workflow';

export class TestNode implements INodeType {
	description: INodeTypeDescription = {
		displayName: 'Test Node',
		name: 'testNode',
		group: ['output'],
		version: 1,
		inputs: ['main'],
		outputs: ['main'],
		${credentialsProperty}
		properties: [],
	};
}`;
}
|
||||
|
||||
/** A class with a `credentials` field but no INodeType shape — the rule must ignore it. */
function createNonNodeClass(): string {
	return `
export class RegularClass {
	credentials = [
		{ name: 'ExternalApi', required: true }
	];
}`;
}
|
||||
|
||||
/** A class with a description/credentials shape but not implementing INodeType — must be ignored. */
function createNonINodeTypeClass(): string {
	return `
export class NotANode {
	description = {
		displayName: 'Not A Node',
		credentials: [
			{ name: 'ExternalApi', required: true }
		]
	};
}`;
}
|
||||
|
||||
// Default mock behaviour for the module-level ruleTester.run below: a
// package.json is found and declares exactly these two credentials.
mockFindPackageJson.mockReturnValue('/tmp/package.json');
mockReadPackageJsonCredentials.mockReturnValue(new Set(['myApiCredential', 'oauthApi']));

ruleTester.run('valid-credential-references', ValidCredentialReferencesRule, {
	valid: [
		{
			name: 'node referencing a credential that exists (object form)',
			filename: nodeFilePath,
			code: createNodeCode([{ name: 'myApiCredential', required: true }]),
		},
		{
			name: 'node referencing a credential that exists (string form)',
			filename: nodeFilePath,
			code: createNodeCode(['myApiCredential']),
		},
		{
			name: 'node referencing multiple credentials that all exist',
			filename: nodeFilePath,
			code: createNodeCode(['myApiCredential', { name: 'oauthApi', required: false }]),
		},
		{
			name: 'node without credentials array',
			filename: nodeFilePath,
			code: createNodeCode(),
		},
		{
			name: 'non-node file ignored',
			filename: '/tmp/regular-file.ts',
			code: createNonNodeClass(),
		},
		{
			name: 'non-INodeType class ignored',
			filename: nodeFilePath,
			code: createNonINodeTypeClass(),
		},
	],
	invalid: [
		{
			name: 'credential name does not exist in package (object form)',
			filename: nodeFilePath,
			code: createNodeCode([{ name: 'brokenReference', required: true }]),
			errors: [
				{
					messageId: 'credentialNotFound',
					data: { credentialName: 'brokenReference' },
				},
			],
		},
		{
			name: 'credential name does not exist in package (string form)',
			filename: nodeFilePath,
			code: createNodeCode(['unknownCredential']),
			errors: [
				{
					messageId: 'credentialNotFound',
					data: { credentialName: 'unknownCredential' },
				},
			],
		},
		{
			// The near-miss name triggers a "did you mean" suggestion whose fix
			// rewrites the credential name (double-quoted — see createExpectedNodeCode).
			name: 'credential name is a typo close to an existing credential — suggestion provided',
			filename: nodeFilePath,
			code: createNodeCode([{ name: 'myApiCredentail', required: true }]),
			errors: [
				{
					messageId: 'credentialNotFound',
					data: { credentialName: 'myApiCredentail' },
					suggestions: [
						{
							messageId: 'didYouMean',
							data: { suggestedName: 'myApiCredential' },
							output: createExpectedNodeCode([{ name: 'myApiCredential', required: true }]),
						},
					],
				},
			],
		},
		{
			name: 'mix of valid and invalid credentials — only invalid reported',
			filename: nodeFilePath,
			code: createNodeCode(['myApiCredential', { name: 'brokenRef', required: true }]),
			errors: [
				{
					messageId: 'credentialNotFound',
					data: { credentialName: 'brokenRef' },
				},
			],
		},
	],
});

// When no package.json can be located, the rule must skip the check entirely
// rather than flag every credential reference.
describe('valid-credential-references — no package.json found', () => {
	beforeEach(() => {
		mockFindPackageJson.mockReturnValue(null);
	});
	afterEach(() => {
		// Restore the default used by the module-level run above.
		mockFindPackageJson.mockReturnValue('/tmp/package.json');
	});

	ruleTester.run('valid-credential-references (no package.json)', ValidCredentialReferencesRule, {
		valid: [
			{
				name: 'check is skipped when package.json cannot be found',
				filename: nodeFilePath,
				code: createNodeCode([{ name: 'anyCredential', required: true }]),
			},
		],
		invalid: [],
	});
});
|
||||
|
|
@ -0,0 +1,105 @@
|
|||
import { TSESTree } from '@typescript-eslint/types';
|
||||
import type { ReportSuggestionArray } from '@typescript-eslint/utils/ts-eslint';
|
||||
|
||||
import {
|
||||
isNodeTypeClass,
|
||||
findClassProperty,
|
||||
findArrayLiteralProperty,
|
||||
extractCredentialNameFromArray,
|
||||
findPackageJson,
|
||||
readPackageJsonCredentials,
|
||||
isFileType,
|
||||
findSimilarStrings,
|
||||
createRule,
|
||||
} from '../utils/index.js';
|
||||
|
||||
export const ValidCredentialReferencesRule = createRule({
	name: 'valid-credential-references',
	meta: {
		type: 'problem',
		docs: {
			description:
				'Ensure credentials referenced in node descriptions exist as credential classes in the package',
		},
		messages: {
			credentialNotFound:
				'Credential "{{ credentialName }}" does not exist in this package. Check for typos or ensure the credential class is declared and listed in package.json.',
			didYouMean: "Did you mean '{{ suggestedName }}'?",
		},
		schema: [],
		hasSuggestions: true,
	},
	defaultOptions: [],
	create(context) {
		// Only *.node.ts files declare node descriptions.
		if (!isFileType(context.filename, '.node.ts')) {
			return {};
		}

		// Lazily loaded, memoised set of credential names declared in the nearest
		// package.json (null = not loaded yet; empty set = nothing found).
		let packageCredentials: Set<string> | null = null;

		const loadPackageCredentials = (): Set<string> => {
			if (packageCredentials !== null) {
				return packageCredentials;
			}

			const packageJsonPath = findPackageJson(context.filename);
			if (!packageJsonPath) {
				// No package.json found — cache an empty set so the check is skipped.
				packageCredentials = new Set();
				return packageCredentials;
			}

			packageCredentials = readPackageJsonCredentials(packageJsonPath);
			return packageCredentials;
		};

		return {
			ClassDeclaration(node) {
				if (!isNodeTypeClass(node)) {
					return;
				}

				// `description: INodeTypeDescription = { ... }`
				const descriptionProperty = findClassProperty(node, 'description');
				if (
					!descriptionProperty?.value ||
					descriptionProperty.value.type !== TSESTree.AST_NODE_TYPES.ObjectExpression
				) {
					return;
				}

				const credentialsArray = findArrayLiteralProperty(descriptionProperty.value, 'credentials');
				if (!credentialsArray) {
					return;
				}

				// An empty set means the package declares no credentials (or no
				// package.json was found) — skip rather than flag everything.
				const knownCredentials = loadPackageCredentials();
				if (knownCredentials.size === 0) {
					return;
				}

				credentialsArray.elements.forEach((element) => {
					const credentialInfo = extractCredentialNameFromArray(element);
					if (!credentialInfo || knownCredentials.has(credentialInfo.name)) {
						return;
					}

					// Offer close matches as editor suggestions.
					// NOTE(review): the fixer emits double-quoted names — confirm this
					// matches the project's quote style / formatter configuration.
					const similar = findSimilarStrings(credentialInfo.name, knownCredentials);
					const suggestions: ReportSuggestionArray<'credentialNotFound' | 'didYouMean'> =
						similar.map((suggestedName) => ({
							messageId: 'didYouMean' as const,
							data: { suggestedName },
							fix(fixer) {
								return fixer.replaceText(credentialInfo.node, `"${suggestedName}"`);
							},
						}));

					context.report({
						node: credentialInfo.node,
						messageId: 'credentialNotFound',
						data: { credentialName: credentialInfo.name },
						suggest: suggestions,
					});
				});
			},
		};
	},
});
|
||||
|
|
@ -107,6 +107,11 @@ export class RegularClass {
|
|||
code: createTriggerNode({ webhookMethods: null }),
|
||||
errors: [{ messageId: 'missingWebhookMethods' }],
|
||||
},
|
||||
{
|
||||
name: 'trigger node with empty webhookMethods object (no lifecycle groups)',
|
||||
code: createTriggerNode({ webhookMethods: '{}' }),
|
||||
errors: [{ messageId: 'emptyWebhookMethods' }],
|
||||
},
|
||||
{
|
||||
name: 'trigger node with empty webhookMethods group (all three missing)',
|
||||
code: createTriggerNode({
|
||||
|
|
|
|||
|
|
@ -56,6 +56,8 @@ export const WebhookLifecycleCompleteRule = createRule({
|
|||
messages: {
|
||||
missingWebhookMethods:
|
||||
'Webhook trigger node is missing the `webhookMethods` property. Implement `checkExists`, `create`, and `delete` to register, verify, and clean up the webhook on the third-party service.',
|
||||
emptyWebhookMethods:
|
||||
'Webhook trigger node has an empty `webhookMethods` object. Define at least one lifecycle group with `checkExists`, `create`, and `delete` methods.',
|
||||
missingLifecycleMethod:
|
||||
'Webhook trigger lifecycle is incomplete. `webhookMethods.{{group}}` is missing: {{missing}}. All of `checkExists`, `create`, and `delete` must be implemented.',
|
||||
},
|
||||
|
|
@ -91,6 +93,14 @@ export const WebhookLifecycleCompleteRule = createRule({
|
|||
return;
|
||||
}
|
||||
|
||||
if (webhookMethodsProperty.value.properties.length === 0) {
|
||||
context.report({
|
||||
node: webhookMethodsProperty.key,
|
||||
messageId: 'emptyWebhookMethods',
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
for (const groupProperty of webhookMethodsProperty.value.properties) {
|
||||
if (groupProperty.type !== AST_NODE_TYPES.Property) continue;
|
||||
if (groupProperty.value.type !== AST_NODE_TYPES.ObjectExpression) continue;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@n8n/expression-runtime",
|
||||
"version": "0.11.0",
|
||||
"version": "0.12.0",
|
||||
"description": "Secure, isolated expression evaluation runtime for n8n",
|
||||
"main": "dist/cjs/index.js",
|
||||
"module": "dist/esm/index.js",
|
||||
|
|
|
|||
1
packages/@n8n/instance-ai/.gitignore
vendored
Normal file
1
packages/@n8n/instance-ai/.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
.output/
|
||||
|
|
@ -382,14 +382,21 @@ The processor is configurable via `disableDeferredTools` flag.
|
|||
|
||||
## MCP Integration
|
||||
|
||||
External MCP servers are connected via `McpClientManager`. Their tools are:
|
||||
External MCP servers are owned by `McpClientManager` (`mcp/mcp-client-manager.ts`).
|
||||
The cli's `InstanceAiService` holds one manager instance and passes it to
|
||||
`createInstanceAgent` via options; the agent factory calls
|
||||
`mcpManager.getRegularTools(mcpServers)` and
|
||||
`mcpManager.getBrowserTools(orchestrationContext?.browserMcpConfig)`. Tool
|
||||
descriptions are:
|
||||
|
||||
1. **Schema-sanitized** for Anthropic compatibility (ZodNull → optional,
|
||||
discriminated unions → flattened objects, array types → recursive element fix)
|
||||
2. **Name-checked** against reserved domain tool names (prevents malicious
|
||||
shadowing of tools like `run-workflow`)
|
||||
3. **Separated** from domain tools in the orchestrator's tool set
|
||||
4. **Cached** by config hash across agent instances
|
||||
4. **Cached** by config hash inside the manager — the underlying `MCPClient`
|
||||
instances are tracked so `mcpManager.disconnect()` (called during service
|
||||
shutdown) closes SSE / stdio connections cleanly.
|
||||
|
||||
Browser MCP tools (Chrome DevTools) are excluded from the orchestrator to avoid
|
||||
context bloat from screenshots. They're available to `browser-credential-setup`
|
||||
|
|
|
|||
|
|
@ -2,7 +2,20 @@
|
|||
|
||||
Tests whether workflows built by Instance AI actually work by executing them with LLM-generated mock HTTP responses. No real credentials or external services are involved.
|
||||
|
||||
## How it works
|
||||
Three harnesses live here:
|
||||
|
||||
- **`eval:instance-ai`** — end-to-end build + mocked execution + LLM verification (drives a running n8n instance)
|
||||
- **`eval:subagent`** — builder sub-agent against live n8n, scored by binary checks (drives a running n8n instance)
|
||||
- **`eval:pairwise`** — builder sub-agent in-process, scored by an LLM judge panel against do/don't lists (no n8n server). Intended for head-to-head comparison with `ai-workflow-builder.ee` on the same dataset
|
||||
|
||||
Sections:
|
||||
|
||||
- [Running e2e + sub-agent evals](#running-evals)
|
||||
- [Running pairwise evals](#pairwise-evals)
|
||||
- [How the e2e harness works](#how-the-e2e-harness-works)
|
||||
- [How the sub-agent harness works](#how-the-sub-agent-harness-works)
|
||||
|
||||
## Running evals
|
||||
|
||||
Each run:
|
||||
|
||||
|
|
@ -142,9 +155,194 @@ Every run produces:
|
|||
|
||||
**LangSmith caveat:** if `LANGSMITH_API_KEY` is set in `.env.local`, local runs also land in the shared `instance-ai-workflow-evals` dataset. Unset it (or run without `dotenvx`) to keep exploratory runs out of team results.
|
||||
|
||||
## Writing test cases
|
||||
## Pairwise evals
|
||||
|
||||
Test cases live in `evaluations/data/workflows/*.json`. Drop a file in, the CLI and LangSmith sync pick it up — no registration step.
|
||||
Pairwise evals score a built workflow against the dataset's `dos` / `donts`
|
||||
criteria using an LLM judge panel (3 judges by default, majority vote on
|
||||
`pairwise_primary`, mean fraction of criteria satisfied on
|
||||
`pairwise_diagnostic`). The point is **head-to-head comparison with
|
||||
`ai-workflow-builder.ee`** on the same dataset (default
|
||||
`notion-pairwise-workflows`), so the judge panel, defaults, and metric keys
|
||||
are imported from that package directly.
|
||||
|
||||
Unlike the e2e and sub-agent harnesses, pairwise runs the **builder
|
||||
sub-agent in-process** — no n8n server, no Docker, no live workflow service.
|
||||
Stub services capture `createFromWorkflowJSON` calls; HITL suspensions are
|
||||
auto-approved.
|
||||
|
||||
### Quick start
|
||||
|
||||
```bash
|
||||
# From packages/@n8n/instance-ai/
|
||||
|
||||
# 1. Local fixture (small smoke set, no LangSmith required)
|
||||
N8N_AI_ANTHROPIC_KEY="$ANTHROPIC_API_KEY" pnpm eval:pairwise --judges 1
|
||||
|
||||
# 2. Full LangSmith dataset
|
||||
LANGSMITH_API_KEY=... N8N_AI_ANTHROPIC_KEY="$ANTHROPIC_API_KEY" \
|
||||
pnpm eval:pairwise:langsmith --judges 3
|
||||
|
||||
# 3. Rerun a specific subset (one example ID per line; #-prefixed lines ignored)
|
||||
pnpm eval:pairwise:langsmith \
|
||||
--example-ids-file .output/pairwise/failed-ids.txt \
|
||||
--output-dir .output/pairwise/rerun
|
||||
```
|
||||
|
||||
### Sandbox
|
||||
|
||||
Pairwise evals always run inside a sandbox — the same path production uses.
|
||||
The agent writes TypeScript to `~/workspace/src/workflow.ts` inside the
|
||||
sandbox, runs `tsc` to validate, and calls `submit-workflow` to save the
|
||||
parsed `WorkflowJSON`. This exercises the production builder agent
|
||||
end-to-end (sandbox prompt, file I/O, real type checking).
|
||||
|
||||
Required env vars (Daytona provider — the default):
|
||||
|
||||
```bash
|
||||
ANTHROPIC_API_KEY=sk-ant-... # builder + judge LLM
|
||||
LANGSMITH_API_KEY=ls__... # only for --backend langsmith
|
||||
DAYTONA_API_URL=https://app.daytona.io/api
|
||||
DAYTONA_API_KEY=dtn_...
|
||||
|
||||
# Optional
|
||||
N8N_INSTANCE_AI_SANDBOX_PROVIDER=daytona # default; set 'local' or 'n8n-sandbox' to switch
|
||||
N8N_INSTANCE_AI_SANDBOX_IMAGE=daytonaio/sandbox:0.5.0 # default
|
||||
N8N_INSTANCE_AI_SANDBOX_TIMEOUT=300000 # per-command timeout (ms)
|
||||
```
|
||||
|
||||
The CLI fails fast at startup if the chosen provider is misconfigured (e.g.,
|
||||
Daytona selected without API URL/key). The chosen provider is recorded under
|
||||
`summary.json → sandbox.provider`.
|
||||
|
||||
> **Daytona cold-start.** The very first sandbox creation triggers an image
|
||||
> build on Daytona's side (`npm install` for `@n8n/workflow-sdk`). That can
|
||||
> exceed the SDK's 5-minute create timeout and fail with `Sandbox failed to
|
||||
> become ready within the timeout period`. Once the image is cached, later
|
||||
> runs are fast. Workaround: pre-build the image via the Daytona dashboard
|
||||
> before kicking off a full eval run.
|
||||
|
||||
### Flags
|
||||
|
||||
| Flag | Default | Description |
|
||||
|------|---------|-------------|
|
||||
| `--backend` | `local` | `local` reads `evaluations/data/pairwise/local.json`; `langsmith` pulls from the LangSmith dataset |
|
||||
| `--dataset` | `notion-pairwise-workflows` | LangSmith dataset name (langsmith backend only) |
|
||||
| `--judges` | `3` | Number of judges in the LLM panel |
|
||||
| `--judge-model` | `claude-sonnet-4-5-20250929` | LangChain model id for the judge LLM |
|
||||
| `--iterations` | `1` | Run each example N times — for measuring judge / build variance |
|
||||
| `--concurrency` | `5` | Parallel example workers (`p-limit`) |
|
||||
| `--max-examples` | — | Cap dataset to first N examples |
|
||||
| `--example-ids-file` | — | Path to a text file of LangSmith example IDs (one per line). Used for rerunning a subset |
|
||||
| `--timeout-ms` | `1200000` | Per-example build timeout |
|
||||
| `--output-dir` | `.output/pairwise/<iso>` | Where to write artifacts |
|
||||
| `--experiment-name` | `pairwise-evals-instance-ai` | LangSmith experiment label |
|
||||
| `--verbose` | `false` | Per-example log lines |
|
||||
|
||||
### Outputs
|
||||
|
||||
Each run writes a self-contained directory:
|
||||
|
||||
```
|
||||
.output/pairwise/<run>/
|
||||
├── summary.json # totals: pass rate, avg diagnostic, build failures by class, interactivity counters
|
||||
├── results.jsonl # one line per example: prompt, dos/donts, captured workflow, build metadata, feedback rows
|
||||
├── workflows/<id>.json # normalized workflow JSON (matches SimpleWorkflow shape from ai-workflow-builder.ee)
|
||||
└── chunks/<id>_<iter>.jsonl # per-example agent trace: tool-calls, tool-results, suspensions, final text
|
||||
```
|
||||
|
||||
The `chunks/*.jsonl` traces are the primary tool for root-causing build
|
||||
failures. Each line is one event: `tool-call`, `tool-result`, `suspension`,
|
||||
`auto-approve`, `text`, `stream-finish`, `captured-workflows`, `error`.
|
||||
|
||||
When `LANGSMITH_API_KEY` is set, feedback is also posted to LangSmith with
|
||||
metric keys `pairwise_primary`, `pairwise_diagnostic`,
|
||||
`pairwise_judges_passed`, `pairwise_total_passes`, `pairwise_total_violations`,
|
||||
and per-judge `judge1..N`. Experiment metadata includes
|
||||
`builder: 'instance-ai'` so it can be queried alongside the
|
||||
`ai-workflow-builder.ee` baseline.
|
||||
|
||||
### Build failure classes
|
||||
|
||||
Build failures are tracked separately from judge scores:
|
||||
|
||||
- **`build_timeout`** — exceeded `--timeout-ms`
|
||||
- **`no_workflow_built`** — agent finished without invoking `build-workflow` (no captured workflow)
|
||||
- **`agent_error`** — stream errored or the agent threw
|
||||
|
||||
A failure produces a row with `workflow: null`, empty `feedback`, and the
|
||||
error class — it counts as a primary fail in the comparison report.
|
||||
|
||||
### Interactivity gates
|
||||
|
||||
The agent is stubbed for non-interactive use. The summary tracks divergence
|
||||
from this assumption — investigate any non-zero count:
|
||||
|
||||
- `askUserCount` — `ask-user` tool was invoked (eval responds with `{ approved: false }`)
|
||||
- `planToolCount` — `plan` tool was invoked (single-prompt dataset shouldn't trigger planning)
|
||||
- `autoApprovedSuspensions` — HITL-gated tool fired (e.g., `data-tables` create); auto-approved
|
||||
- `mockedCredentialTypes` — credential types the agent referenced (auto-mocked since `credentialService.list()` returns `[]`)
|
||||
|
||||
### Comparison report
|
||||
|
||||
After running both `ai-workflow-builder.ee/evaluations/cli` (the baseline) and
|
||||
`eval:pairwise` against the same dataset, generate an HTML side-by-side
|
||||
report:
|
||||
|
||||
```bash
|
||||
pnpm eval:pairwise:compare \
|
||||
--ee-dir ../ai-workflow-builder.ee/evaluations/.output/pairwise/<ts> \
|
||||
--ia-dir .output/pairwise/<ts> \
|
||||
--out .output/pairwise/comparison.html
|
||||
```
|
||||
|
||||
The report shows headline metrics, per-prompt verdicts (TIE / IA-only /
|
||||
Code-only / both-pass / both-fail), and lazy-loaded workflow previews — rows
|
||||
collapse by default and only render the heavy `<n8n-demo>` preview when
|
||||
expanded.
|
||||
|
||||
### When pairwise scores wobble
|
||||
|
||||
Judge non-determinism + agent retry behavior mean a single run is not a
|
||||
reliable signal. Two specific things to know:
|
||||
|
||||
- The agent will sometimes retry `build-workflow` after a parser rejection
|
||||
(e.g., security violation) and sometimes give up. Whether a prompt
|
||||
"fails to build" is non-deterministic across runs.
|
||||
- If you're comparing two builders to claim a regression or improvement,
|
||||
bump `--iterations` to ≥3 for both sides.
|
||||
|
||||
## How the e2e harness works
|
||||
|
||||
1. **Build** — sends the test case prompt to Instance AI, which builds a workflow
|
||||
2. **Phase 1** — analyzes the workflow and generates consistent mock data hints (one Sonnet call per scenario)
|
||||
3. **Phase 2** — executes the workflow with all HTTP requests intercepted. Each request goes to an LLM that generates a realistic API response using the node's configuration and API documentation from Context7
|
||||
4. **Verify** — an LLM evaluates whether the scenario's success criteria were met and categorizes any failure by root cause (see Failure categories below)
|
||||
|
||||
### What gets mocked
|
||||
|
||||
- **Mocked nodes** — any node that makes HTTP requests (Gmail, Slack, Google Sheets, HTTP Request, Notion, etc.). The request is intercepted before it leaves the process. An LLM generates the response.
|
||||
- **Pinned nodes** — nodes that don't go through the HTTP layer: trigger/webhook nodes, LangChain/AI nodes (they use SDKs directly), database nodes. These receive LLM-generated data as pin data.
|
||||
- **Real nodes** — logic nodes (Code, Set, Merge, Filter, IF, Switch) execute their actual code on the mocked/pinned data.
|
||||
|
||||
No real credentials or API connections are needed. ~95% of node types are covered; the main gaps are binary-data nodes (file attachments, image generation) and streaming nodes.
|
||||
|
||||
## How the sub-agent harness works
|
||||
|
||||
1. The CLI logs in to n8n with `N8N_EVAL_EMAIL` / `N8N_EVAL_PASSWORD`.
|
||||
2. For each test case it POSTs `/rest/instance-ai/eval/run-sub-agent`.
|
||||
3. The server builds a real `InstanceAiContext` via `InstanceAiAdapterService.createContext`, wraps the workflow service to record created IDs, resolves the `builder` (or other) role's system prompt, instantiates the sub-agent with the full `createAllTools(context)` tool surface, and runs it to completion.
|
||||
4. The server returns `{ text, toolCalls, toolResults, capturedWorkflowIds, ... }`.
|
||||
5. The CLI fetches each captured workflow via `GET /rest/workflows/:id` (this doubles as a round-trip check through the real importer), scores it with the binary-check suite, and archives+deletes it (unless `--keep-workflows`).
|
||||
|
||||
No tools, services, or workflow imports are mocked. The server path exercised here is the same one the orchestrator takes when it spawns a builder sub-agent.
|
||||
|
||||
## LangSmith integration
|
||||
|
||||
When `LANGSMITH_API_KEY` is set, each run is recorded as a LangSmith experiment against the `instance-ai-workflow-evals` dataset (synced from the JSON files before each run). Experiments against the same dataset can be compared side-by-side to spot regressions.
|
||||
|
||||
## Adding test cases
|
||||
|
||||
Test cases live in `evaluations/data/workflows/*.json`. Drop a file in, the CLI and LangSmith sync picks it up — no registration step.
|
||||
|
||||
```json
|
||||
{
|
||||
|
|
@ -225,12 +423,13 @@ The job is **non-blocking**. Results are posted as a PR comment and uploaded as
|
|||
```
|
||||
evaluations/
|
||||
├── index.ts # Public API
|
||||
├── cli/ # CLI entry point, arg parsing, CI metadata
|
||||
├── cli/ # CLI entries: instance-ai, subagent, pairwise, compare-pairwise, report
|
||||
├── clients/ # n8n REST + SSE clients
|
||||
├── checklist/ # LLM verification with retry
|
||||
├── credentials/ # Test credential seeding
|
||||
├── data/workflows/ # Test case JSON files
|
||||
├── harness/ # Runner: buildWorkflow, executeScenario, cleanupBuild
|
||||
├── data/workflows/ # e2e/sub-agent test case JSON files
|
||||
├── data/pairwise/ # Local pairwise fixture (small smoke set)
|
||||
├── harness/ # Runners: buildWorkflow + executeScenario (e2e), in-process-builder (pairwise)
|
||||
├── langsmith/ # Dataset sync + experiment setup
|
||||
├── outcome/ # SSE event parsing, workflow discovery
|
||||
├── report/ # HTML report generator
|
||||
|
|
|
|||
|
|
@ -0,0 +1,158 @@
|
|||
/* eslint-disable @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-member-access */
|
||||
// `SimpleWorkflow` (the return type of `normalizeWorkflow`) is imported from
|
||||
// `ai-workflow-builder.ee` via deep relative paths into source files that use
|
||||
// a `@/*` path alias. That alias collides with instance-ai's own `@/*` mapping
|
||||
// when type-checked transitively, so the type resolves to `error` here even
|
||||
// though the runtime behaviour is correct.
|
||||
import type { WorkflowJSON } from '@n8n/workflow-sdk';
|
||||
|
||||
import { normalizeWorkflow, serializeNormalizedWorkflow } from '../harness/normalize-workflow';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// normalizeWorkflow
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('normalizeWorkflow', () => {
|
||||
it('drops server-assigned and transient fields from the top level', () => {
|
||||
const raw: WorkflowJSON = {
|
||||
id: 'wf-123',
|
||||
name: 'My Workflow',
|
||||
nodes: [],
|
||||
connections: {},
|
||||
settings: { executionOrder: 'v1' },
|
||||
pinData: { node1: [{ foo: 'bar' }] },
|
||||
meta: { instanceId: 'abc' },
|
||||
};
|
||||
|
||||
const result = normalizeWorkflow(raw);
|
||||
|
||||
expect(result).toEqual({
|
||||
name: 'My Workflow',
|
||||
nodes: [],
|
||||
connections: {},
|
||||
});
|
||||
expect(result).not.toHaveProperty('id');
|
||||
expect(result).not.toHaveProperty('settings');
|
||||
expect(result).not.toHaveProperty('pinData');
|
||||
expect(result).not.toHaveProperty('meta');
|
||||
});
|
||||
|
||||
it('preserves node ordering and semantic fields', () => {
|
||||
const raw: WorkflowJSON = {
|
||||
name: 'Two-node workflow',
|
||||
nodes: [
|
||||
{
|
||||
id: 'n1',
|
||||
name: 'Trigger',
|
||||
type: 'n8n-nodes-base.manualTrigger',
|
||||
typeVersion: 1,
|
||||
position: [0, 0],
|
||||
parameters: {},
|
||||
},
|
||||
{
|
||||
id: 'n2',
|
||||
name: 'HTTP Request',
|
||||
type: 'n8n-nodes-base.httpRequest',
|
||||
typeVersion: 4,
|
||||
position: [200, 0],
|
||||
parameters: { url: 'https://example.com' },
|
||||
credentials: { httpBasicAuth: { id: 'cred1', name: 'auth' } },
|
||||
},
|
||||
],
|
||||
connections: {
|
||||
Trigger: {
|
||||
main: [[{ node: 'HTTP Request', type: 'main', index: 0 }]],
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = normalizeWorkflow(raw);
|
||||
|
||||
expect(result.nodes).toHaveLength(2);
|
||||
expect(result.nodes[0].name).toBe('Trigger');
|
||||
expect(result.nodes[1].name).toBe('HTTP Request');
|
||||
expect(result.nodes[1].credentials).toEqual({
|
||||
httpBasicAuth: { id: 'cred1', name: 'auth' },
|
||||
});
|
||||
expect(result.connections).toEqual(raw.connections);
|
||||
});
|
||||
|
||||
it('defaults missing parameters to an empty object so judges see stable shape', () => {
|
||||
const raw: WorkflowJSON = {
|
||||
name: 'No-param workflow',
|
||||
nodes: [
|
||||
{
|
||||
id: 'n1',
|
||||
name: 'Start',
|
||||
type: 'n8n-nodes-base.manualTrigger',
|
||||
typeVersion: 1,
|
||||
position: [0, 0],
|
||||
},
|
||||
],
|
||||
connections: {},
|
||||
};
|
||||
|
||||
const result = normalizeWorkflow(raw);
|
||||
|
||||
expect(result.nodes[0].parameters).toEqual({});
|
||||
});
|
||||
|
||||
it('falls back to node id when name is missing (sticky notes etc.)', () => {
|
||||
const raw: WorkflowJSON = {
|
||||
name: 'Sticky-note workflow',
|
||||
nodes: [
|
||||
{
|
||||
id: 'sticky-1',
|
||||
type: 'n8n-nodes-base.stickyNote',
|
||||
typeVersion: 1,
|
||||
position: [0, 0],
|
||||
},
|
||||
],
|
||||
connections: {},
|
||||
};
|
||||
|
||||
const result = normalizeWorkflow(raw);
|
||||
|
||||
expect(result.nodes[0].name).toBe('sticky-1');
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// serializeNormalizedWorkflow
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('serializeNormalizedWorkflow', () => {
|
||||
it('produces byte-identical output regardless of key insertion order', () => {
|
||||
const a = normalizeWorkflow({
|
||||
name: 'Order test',
|
||||
nodes: [
|
||||
{
|
||||
id: 'n1',
|
||||
name: 'Node',
|
||||
type: 't',
|
||||
typeVersion: 1,
|
||||
position: [0, 0],
|
||||
parameters: { z: 1, a: 2 },
|
||||
},
|
||||
],
|
||||
connections: { Node: { main: [] } },
|
||||
});
|
||||
|
||||
const b = normalizeWorkflow({
|
||||
connections: { Node: { main: [] } },
|
||||
name: 'Order test',
|
||||
nodes: [
|
||||
{
|
||||
position: [0, 0],
|
||||
typeVersion: 1,
|
||||
type: 't',
|
||||
name: 'Node',
|
||||
id: 'n1',
|
||||
parameters: { a: 2, z: 1 },
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
expect(serializeNormalizedWorkflow(a)).toBe(serializeNormalizedWorkflow(b));
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,128 @@
|
|||
import { pairRecords, promptJoinKey } from '../cli/compare-pairwise';
|
||||
import type { BuilderRecord, FeedbackEntry } from '../cli/compare-pairwise';
|
||||
|
||||
const buildRecord = (overrides: Partial<BuilderRecord> = {}): BuilderRecord => ({
|
||||
prompt: 'Build a workflow',
|
||||
workflow: { name: 'wf' },
|
||||
durationMs: 100,
|
||||
success: true,
|
||||
feedback: [],
|
||||
...overrides,
|
||||
});
|
||||
|
||||
const passFeedback = (): FeedbackEntry[] => [{ metric: 'pairwise_primary', score: 1 }];
|
||||
const failFeedback = (): FeedbackEntry[] => [{ metric: 'pairwise_primary', score: 0 }];
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// promptJoinKey
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('promptJoinKey', () => {
|
||||
it('collapses whitespace and trims', () => {
|
||||
expect(promptJoinKey(' build\n a workflow\t')).toBe('build a workflow');
|
||||
});
|
||||
|
||||
it('treats CRLF and LF as equivalent', () => {
|
||||
expect(promptJoinKey('a\r\nb')).toBe(promptJoinKey('a\nb'));
|
||||
});
|
||||
|
||||
it('matches identical content with different indentation', () => {
|
||||
const a = 'Step 1\n\tStep 2';
|
||||
const b = 'Step 1 Step 2';
|
||||
expect(promptJoinKey(a)).toBe(promptJoinKey(b));
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// pairRecords verdict matrix
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('pairRecords', () => {
|
||||
it('produces a both-pass verdict when both builders pass primary', () => {
|
||||
const ee = [buildRecord({ prompt: 'A', feedback: passFeedback() })];
|
||||
const ia = [buildRecord({ prompt: 'A', feedback: passFeedback() })];
|
||||
const rows = pairRecords(ee, ia);
|
||||
expect(rows).toHaveLength(1);
|
||||
expect(rows[0].verdict).toBe('both-pass');
|
||||
});
|
||||
|
||||
it('produces a both-fail verdict when neither builder passes', () => {
|
||||
const ee = [buildRecord({ prompt: 'A', feedback: failFeedback() })];
|
||||
const ia = [buildRecord({ prompt: 'A', feedback: failFeedback() })];
|
||||
expect(pairRecords(ee, ia)[0].verdict).toBe('both-fail');
|
||||
});
|
||||
|
||||
it('produces an ee-only verdict when only EE passes', () => {
|
||||
const ee = [buildRecord({ prompt: 'A', feedback: passFeedback() })];
|
||||
const ia = [buildRecord({ prompt: 'A', feedback: failFeedback() })];
|
||||
expect(pairRecords(ee, ia)[0].verdict).toBe('ee-only');
|
||||
});
|
||||
|
||||
it('produces an ia-only verdict when only IA passes', () => {
|
||||
const ee = [buildRecord({ prompt: 'A', feedback: failFeedback() })];
|
||||
const ia = [buildRecord({ prompt: 'A', feedback: passFeedback() })];
|
||||
expect(pairRecords(ee, ia)[0].verdict).toBe('ia-only');
|
||||
});
|
||||
|
||||
it('counts a build failure as a fail in the verdict', () => {
|
||||
const ee = [
|
||||
buildRecord({ prompt: 'A', success: false, errorClass: 'build_timeout', feedback: [] }),
|
||||
];
|
||||
const ia = [buildRecord({ prompt: 'A', feedback: passFeedback() })];
|
||||
expect(pairRecords(ee, ia)[0].verdict).toBe('ia-only');
|
||||
});
|
||||
|
||||
it('counts a built-but-unscored row (success=true, no primary feedback) as fail', () => {
|
||||
const ee = [buildRecord({ prompt: 'A', success: true, feedback: [] })];
|
||||
const ia = [buildRecord({ prompt: 'A', feedback: passFeedback() })];
|
||||
expect(pairRecords(ee, ia)[0].verdict).toBe('ia-only');
|
||||
});
|
||||
|
||||
it('joins records whose prompts differ only in whitespace', () => {
|
||||
const ee = [buildRecord({ prompt: 'A B', feedback: passFeedback() })];
|
||||
const ia = [buildRecord({ prompt: ' A B \n', feedback: passFeedback() })];
|
||||
const rows = pairRecords(ee, ia);
|
||||
expect(rows).toHaveLength(1);
|
||||
expect(rows[0].ee).toBe(ee[0]);
|
||||
expect(rows[0].ia).toBe(ia[0]);
|
||||
expect(rows[0].verdict).toBe('both-pass');
|
||||
});
|
||||
|
||||
it('keeps unmatched rows with the missing-side undefined', () => {
|
||||
const ee = [buildRecord({ prompt: 'A', feedback: passFeedback() })];
|
||||
const ia = [buildRecord({ prompt: 'B', feedback: passFeedback() })];
|
||||
const rows = pairRecords(ee, ia);
|
||||
expect(rows).toHaveLength(2);
|
||||
const a = rows.find((r) => r.prompt === 'A');
|
||||
const b = rows.find((r) => r.prompt === 'B');
|
||||
expect(a?.ee).toBeDefined();
|
||||
expect(a?.ia).toBeUndefined();
|
||||
expect(b?.ee).toBeUndefined();
|
||||
expect(b?.ia).toBeDefined();
|
||||
});
|
||||
|
||||
it('prefers IA criteria when both are present', () => {
|
||||
const ee = [buildRecord({ prompt: 'A', dos: 'ee-do', donts: 'ee-dont' })];
|
||||
const ia = [buildRecord({ prompt: 'A', dos: 'ia-do', donts: 'ia-dont' })];
|
||||
const row = pairRecords(ee, ia)[0];
|
||||
expect(row.dos).toBe('ia-do');
|
||||
expect(row.donts).toBe('ia-dont');
|
||||
});
|
||||
|
||||
it('orders rows: ee-only, ia-only, both-fail, both-pass', () => {
|
||||
const ee = [
|
||||
buildRecord({ prompt: 'pass-pass', feedback: passFeedback() }),
|
||||
buildRecord({ prompt: 'ee-only', feedback: passFeedback() }),
|
||||
buildRecord({ prompt: 'both-fail', feedback: failFeedback() }),
|
||||
buildRecord({ prompt: 'ia-only', feedback: failFeedback() }),
|
||||
];
|
||||
const ia = [
|
||||
buildRecord({ prompt: 'pass-pass', feedback: passFeedback() }),
|
||||
buildRecord({ prompt: 'ee-only', feedback: failFeedback() }),
|
||||
buildRecord({ prompt: 'both-fail', feedback: failFeedback() }),
|
||||
buildRecord({ prompt: 'ia-only', feedback: passFeedback() }),
|
||||
];
|
||||
const verdicts = pairRecords(ee, ia).map((r) => r.verdict);
|
||||
expect(verdicts).toEqual(['ee-only', 'ia-only', 'both-fail', 'both-pass']);
|
||||
});
|
||||
});
|
||||
139
packages/@n8n/instance-ai/evaluations/__tests__/redact.test.ts
Normal file
139
packages/@n8n/instance-ai/evaluations/__tests__/redact.test.ts
Normal file
|
|
@ -0,0 +1,139 @@
|
|||
import { redactSecrets, stringifyError, truncate } from '../harness/redact';
|
||||
|
||||
describe('redactSecrets', () => {
|
||||
it('redacts values under secret-shaped keys', () => {
|
||||
const input = {
|
||||
username: 'alice',
|
||||
password: 'hunter2',
|
||||
apiKey: 'sk-abc',
|
||||
api_key: 'sk-def',
|
||||
'X-Api-Key': 'sk-ghi',
|
||||
authorization: 'Bearer xyz',
|
||||
refreshToken: 'rt-1',
|
||||
cookie: 'sid=1',
|
||||
privateKey: '----BEGIN----',
|
||||
sessionId: 's-1',
|
||||
credentials: { value: 'opaque' },
|
||||
};
|
||||
|
||||
expect(redactSecrets(input)).toEqual({
|
||||
username: 'alice',
|
||||
password: '[REDACTED]',
|
||||
apiKey: '[REDACTED]',
|
||||
api_key: '[REDACTED]',
|
||||
'X-Api-Key': '[REDACTED]',
|
||||
authorization: '[REDACTED]',
|
||||
refreshToken: '[REDACTED]',
|
||||
cookie: '[REDACTED]',
|
||||
privateKey: '[REDACTED]',
|
||||
sessionId: '[REDACTED]',
|
||||
credentials: '[REDACTED]',
|
||||
});
|
||||
});
|
||||
|
||||
it('walks nested objects and arrays', () => {
|
||||
const input = {
|
||||
outer: {
|
||||
inner: { token: 't', name: 'ok' },
|
||||
list: [{ password: 'p', kept: 'k' }],
|
||||
},
|
||||
};
|
||||
|
||||
expect(redactSecrets(input)).toEqual({
|
||||
outer: {
|
||||
inner: { token: '[REDACTED]', name: 'ok' },
|
||||
list: [{ password: '[REDACTED]', kept: 'k' }],
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it('passes primitives, null, and undefined through unchanged', () => {
|
||||
expect(redactSecrets('plain')).toBe('plain');
|
||||
expect(redactSecrets(42)).toBe(42);
|
||||
expect(redactSecrets(true)).toBe(true);
|
||||
expect(redactSecrets(null)).toBeNull();
|
||||
expect(redactSecrets(undefined)).toBeUndefined();
|
||||
});
|
||||
|
||||
it('does not mutate the original object', () => {
|
||||
const original = { token: 'real-token' };
|
||||
redactSecrets(original);
|
||||
expect(original.token).toBe('real-token');
|
||||
});
|
||||
|
||||
it('caps recursion depth so deeply nested input cannot blow the stack', () => {
|
||||
let nested: unknown = { token: 'leaf' };
|
||||
for (let i = 0; i < 12; i += 1) {
|
||||
nested = { wrap: nested };
|
||||
}
|
||||
expect(() => redactSecrets(nested)).not.toThrow();
|
||||
});
|
||||
|
||||
it('leaves class instances untouched (only redacts plain objects)', () => {
|
||||
class WithSecret {
|
||||
constructor(public token: string) {}
|
||||
}
|
||||
const instance = new WithSecret('keep-me');
|
||||
expect(redactSecrets(instance)).toBe(instance);
|
||||
});
|
||||
});
|
||||
|
||||
describe('truncate', () => {
|
||||
it('passes short values through after redaction', () => {
|
||||
expect(truncate({ name: 'a', token: 't' }, 200)).toEqual({ name: 'a', token: '[REDACTED]' });
|
||||
});
|
||||
|
||||
it('returns the truncated stringified form when over the limit', () => {
|
||||
const big = { msg: 'a'.repeat(500) };
|
||||
const out = truncate(big, 50);
|
||||
expect(typeof out).toBe('string');
|
||||
expect((out as string).endsWith('... [truncated]')).toBe(true);
|
||||
});
|
||||
|
||||
it('returns "<unserializable>" when JSON.stringify throws on circular refs', () => {
|
||||
const circular: Record<string, unknown> = {};
|
||||
circular.self = circular;
|
||||
expect(truncate(circular, 200)).toBe('<unserializable>');
|
||||
});
|
||||
|
||||
it('returns "<unserializable>" when JSON.stringify returns undefined', () => {
|
||||
const fn = (): void => {};
|
||||
expect(truncate(fn, 200)).toBe('<unserializable>');
|
||||
});
|
||||
});
|
||||
|
||||
describe('stringifyError', () => {
|
||||
it('returns string errors unchanged when within limit', () => {
|
||||
expect(stringifyError('boom', 100)).toBe('boom');
|
||||
});
|
||||
|
||||
it('truncates long string errors', () => {
|
||||
const long = 'x'.repeat(50);
|
||||
expect(stringifyError(long, 10)).toBe('xxxxxxxxxx');
|
||||
});
|
||||
|
||||
it('JSON-stringifies object errors and redacts secrets', () => {
|
||||
const out = stringifyError({ message: 'fail', token: 'leaked' }, 200);
|
||||
expect(out).toContain('"message":"fail"');
|
||||
expect(out).toContain('[REDACTED]');
|
||||
expect(out).not.toContain('leaked');
|
||||
});
|
||||
|
||||
it('falls back to String() when JSON.stringify returns undefined', () => {
|
||||
const fn = (): void => {};
|
||||
expect(stringifyError(fn, 200)).toBe(String(fn));
|
||||
});
|
||||
|
||||
it('falls back to String() when JSON.stringify throws on circular refs', () => {
|
||||
const circular: Record<string, unknown> = { name: 'cycle' };
|
||||
circular.self = circular;
|
||||
expect(() => stringifyError(circular, 200)).not.toThrow();
|
||||
expect(typeof stringifyError(circular, 200)).toBe('string');
|
||||
});
|
||||
|
||||
it('truncates serialized object errors past max length', () => {
|
||||
const big = { msg: 'a'.repeat(500) };
|
||||
const out = stringifyError(big, 50);
|
||||
expect(out.length).toBe(50);
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,105 @@
|
|||
import { resolveSandboxConfig } from '../harness/sandbox-config';
|
||||
|
||||
const baseEnv = (extras: Record<string, string | undefined> = {}): NodeJS.ProcessEnv => {
|
||||
const env: NodeJS.ProcessEnv = {};
|
||||
for (const [k, v] of Object.entries(extras)) {
|
||||
if (v !== undefined) env[k] = v;
|
||||
}
|
||||
return env;
|
||||
};
|
||||
|
||||
describe('resolveSandboxConfig', () => {
|
||||
it('returns a daytona config when DAYTONA env vars are set', () => {
|
||||
const env = baseEnv({
|
||||
DAYTONA_API_URL: 'https://app.daytona.io/api',
|
||||
DAYTONA_API_KEY: 'dtn_xxx',
|
||||
});
|
||||
const config = resolveSandboxConfig(env);
|
||||
expect(config).toEqual({
|
||||
enabled: true,
|
||||
provider: 'daytona',
|
||||
daytonaApiUrl: 'https://app.daytona.io/api',
|
||||
daytonaApiKey: 'dtn_xxx',
|
||||
timeout: 300_000,
|
||||
createTimeoutSeconds: 900,
|
||||
});
|
||||
});
|
||||
|
||||
it('forwards optional image + timeout overrides', () => {
|
||||
const env = baseEnv({
|
||||
DAYTONA_API_URL: 'https://app.daytona.io/api',
|
||||
DAYTONA_API_KEY: 'dtn_xxx',
|
||||
N8N_INSTANCE_AI_SANDBOX_IMAGE: 'custom/image:1.0',
|
||||
N8N_INSTANCE_AI_SANDBOX_TIMEOUT: '600000',
|
||||
});
|
||||
const config = resolveSandboxConfig(env);
|
||||
if (!config.enabled || config.provider !== 'daytona') throw new Error('expected daytona');
|
||||
expect(config.image).toBe('custom/image:1.0');
|
||||
expect(config.timeout).toBe(600_000);
|
||||
expect(config.createTimeoutSeconds).toBe(900);
|
||||
});
|
||||
|
||||
it('honors a custom createTimeoutSeconds env override', () => {
|
||||
const env = baseEnv({
|
||||
DAYTONA_API_URL: 'https://app.daytona.io/api',
|
||||
DAYTONA_API_KEY: 'dtn_xxx',
|
||||
N8N_INSTANCE_AI_SANDBOX_CREATE_TIMEOUT_SECONDS: '1800',
|
||||
});
|
||||
const config = resolveSandboxConfig(env);
|
||||
if (!config.enabled || config.provider !== 'daytona') throw new Error('expected daytona');
|
||||
expect(config.createTimeoutSeconds).toBe(1800);
|
||||
});
|
||||
|
||||
it('rejects a non-integer createTimeoutSeconds', () => {
|
||||
const env = baseEnv({
|
||||
DAYTONA_API_URL: 'https://app.daytona.io/api',
|
||||
DAYTONA_API_KEY: 'dtn_xxx',
|
||||
N8N_INSTANCE_AI_SANDBOX_CREATE_TIMEOUT_SECONDS: 'not-a-number',
|
||||
});
|
||||
expect(() => resolveSandboxConfig(env)).toThrow(
|
||||
/N8N_INSTANCE_AI_SANDBOX_CREATE_TIMEOUT_SECONDS/,
|
||||
);
|
||||
});
|
||||
|
||||
it('throws a clear error when DAYTONA_API_KEY is missing', () => {
|
||||
const env = baseEnv({ DAYTONA_API_URL: 'https://app.daytona.io/api' });
|
||||
expect(() => resolveSandboxConfig(env)).toThrow(/DAYTONA_API_KEY/);
|
||||
});
|
||||
|
||||
it('throws a clear error when DAYTONA_API_URL is missing', () => {
|
||||
const env = baseEnv({ DAYTONA_API_KEY: 'dtn_xxx' });
|
||||
expect(() => resolveSandboxConfig(env)).toThrow(/DAYTONA_API_URL/);
|
||||
});
|
||||
|
||||
it('returns a local config when provider=local', () => {
|
||||
const env = baseEnv({ N8N_INSTANCE_AI_SANDBOX_PROVIDER: 'local' });
|
||||
const config = resolveSandboxConfig(env);
|
||||
expect(config).toEqual({ enabled: true, provider: 'local', timeout: 300_000 });
|
||||
});
|
||||
|
||||
it('returns an n8n-sandbox config when provider=n8n-sandbox with serviceUrl', () => {
|
||||
const env = baseEnv({
|
||||
N8N_INSTANCE_AI_SANDBOX_PROVIDER: 'n8n-sandbox',
|
||||
N8N_SANDBOX_SERVICE_URL: 'https://sandbox.example.com',
|
||||
N8N_SANDBOX_SERVICE_API_KEY: 'sb_key',
|
||||
});
|
||||
const config = resolveSandboxConfig(env);
|
||||
expect(config).toEqual({
|
||||
enabled: true,
|
||||
provider: 'n8n-sandbox',
|
||||
serviceUrl: 'https://sandbox.example.com',
|
||||
apiKey: 'sb_key',
|
||||
timeout: 300_000,
|
||||
});
|
||||
});
|
||||
|
||||
it('throws a clear error when provider=n8n-sandbox without serviceUrl', () => {
|
||||
const env = baseEnv({ N8N_INSTANCE_AI_SANDBOX_PROVIDER: 'n8n-sandbox' });
|
||||
expect(() => resolveSandboxConfig(env)).toThrow(/N8N_SANDBOX_SERVICE_URL/);
|
||||
});
|
||||
|
||||
it('rejects an unknown provider', () => {
|
||||
const env = baseEnv({ N8N_INSTANCE_AI_SANDBOX_PROVIDER: 'gvisor' });
|
||||
expect(() => resolveSandboxConfig(env)).toThrow(/provider/);
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,152 @@
|
|||
// Covers the eval node-service stub: real metadata coercion from
|
||||
// `nodes.json` and discovery of `dist/node-definitions/` dirs. Both
|
||||
// behaviors are critical to the pairwise eval producing scores against
|
||||
// production-faithful node descriptions rather than stripped-down stubs.
|
||||
|
||||
import { promises as fs } from 'node:fs';
|
||||
import { tmpdir } from 'node:os';
|
||||
import path from 'node:path';
|
||||
|
||||
import { createStubServices, resolveEvalNodeDefinitionDirs } from '../harness/stub-services';
|
||||
|
||||
async function writeNodesJson(entries: unknown[]): Promise<string> {
|
||||
const dir = await fs.mkdtemp(path.join(tmpdir(), 'eval-stub-services-'));
|
||||
const file = path.join(dir, 'nodes.json');
|
||||
await fs.writeFile(file, JSON.stringify(entries), 'utf8');
|
||||
return file;
|
||||
}
|
||||
|
||||
describe('createStubServices nodeService.getDescription', () => {
|
||||
it('returns properties, credentials, inputs and outputs from the node catalogue', async () => {
|
||||
const file = await writeNodesJson([
|
||||
{
|
||||
name: 'n8n-nodes-base.httpRequest',
|
||||
displayName: 'HTTP Request',
|
||||
description: 'Make HTTP requests',
|
||||
group: ['input'],
|
||||
version: [1, 2, 3],
|
||||
inputs: ['main'],
|
||||
outputs: ['main'],
|
||||
properties: [
|
||||
{
|
||||
displayName: 'Method',
|
||||
name: 'method',
|
||||
type: 'options',
|
||||
required: true,
|
||||
default: 'GET',
|
||||
options: [
|
||||
{ name: 'GET', value: 'GET' },
|
||||
{ name: 'POST', value: 'POST' },
|
||||
],
|
||||
},
|
||||
{
|
||||
displayName: 'URL',
|
||||
name: 'url',
|
||||
type: 'string',
|
||||
default: '',
|
||||
},
|
||||
],
|
||||
credentials: [
|
||||
{ name: 'httpBasicAuth', required: false },
|
||||
{ name: 'httpHeaderAuth', required: true },
|
||||
],
|
||||
},
|
||||
]);
|
||||
|
||||
const { context } = await createStubServices({ nodesJsonPath: file });
|
||||
const desc = await context.nodeService.getDescription('n8n-nodes-base.httpRequest');
|
||||
|
||||
expect(desc.name).toBe('n8n-nodes-base.httpRequest');
|
||||
expect(desc.displayName).toBe('HTTP Request');
|
||||
// Latest version pulled from the version array.
|
||||
expect(desc.version).toBe(3);
|
||||
expect(desc.group).toEqual(['input']);
|
||||
expect(desc.properties).toHaveLength(2);
|
||||
expect(desc.properties[0]).toMatchObject({
|
||||
displayName: 'Method',
|
||||
name: 'method',
|
||||
type: 'options',
|
||||
required: true,
|
||||
default: 'GET',
|
||||
options: [
|
||||
{ name: 'GET', value: 'GET' },
|
||||
{ name: 'POST', value: 'POST' },
|
||||
],
|
||||
});
|
||||
expect(desc.credentials).toEqual([
|
||||
{ name: 'httpBasicAuth', required: false },
|
||||
{ name: 'httpHeaderAuth', required: true },
|
||||
]);
|
||||
expect(desc.inputs).toEqual(['main']);
|
||||
expect(desc.outputs).toEqual(['main']);
|
||||
});
|
||||
|
||||
it('drops option entries that lack name or a primitive value', async () => {
|
||||
const file = await writeNodesJson([
|
||||
{
|
||||
name: 'n8n-nodes-base.example',
|
||||
displayName: 'Example',
|
||||
version: 1,
|
||||
inputs: ['main'],
|
||||
outputs: ['main'],
|
||||
properties: [
|
||||
{
|
||||
displayName: 'Mode',
|
||||
name: 'mode',
|
||||
type: 'options',
|
||||
options: [
|
||||
{ name: 'Valid', value: 'valid' },
|
||||
// Object value is dropped — runtime assertions in nodes.tool.ts
|
||||
// expect string|number|boolean only.
|
||||
{ name: 'Invalid', value: { nested: true } },
|
||||
// Missing `value` is dropped.
|
||||
{ name: 'NoValue' },
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
]);
|
||||
|
||||
const { context } = await createStubServices({ nodesJsonPath: file });
|
||||
const desc = await context.nodeService.getDescription('n8n-nodes-base.example');
|
||||
expect(desc.properties[0].options).toEqual([{ name: 'Valid', value: 'valid' }]);
|
||||
});
|
||||
|
||||
it('throws when the node type is not in the catalogue', async () => {
|
||||
const file = await writeNodesJson([
|
||||
{ name: 'n8n-nodes-base.a', displayName: 'A', version: 1, inputs: [], outputs: [] },
|
||||
]);
|
||||
const { context } = await createStubServices({ nodesJsonPath: file });
|
||||
await expect(context.nodeService.getDescription('does-not-exist')).rejects.toThrow(/not found/);
|
||||
});
|
||||
});
|
||||
|
||||
describe('createStubServices nodeService.getNodeTypeDefinition', () => {
|
||||
it('returns a clear error when no node-definition dirs are available', async () => {
|
||||
// Force the empty-dirs branch by passing in a node catalogue that
|
||||
// references a node we won't have built. We can't easily simulate
|
||||
// "no dirs" without monkeypatching, so we instead exercise the more
|
||||
// likely scenario: the dirs exist but the node id is unknown to the
|
||||
// resolver. That covers the same surface the agent would observe.
|
||||
const file = await writeNodesJson([
|
||||
{ name: 'n8n-nodes-base.unknownNode', displayName: 'X', version: 1, inputs: [], outputs: [] },
|
||||
]);
|
||||
const { context } = await createStubServices({ nodesJsonPath: file });
|
||||
const td = await context.nodeService.getNodeTypeDefinition!('n8n-nodes-base.unknownNode');
|
||||
expect(td).toBeDefined();
|
||||
// Either the dirs are missing entirely (no build) or the node id is
|
||||
// unknown — both surface as an error string with empty content.
|
||||
expect(td?.content).toBe('');
|
||||
expect(td?.error).toBeTruthy();
|
||||
});
|
||||
});
|
||||
|
||||
describe('resolveEvalNodeDefinitionDirs', () => {
|
||||
it('returns absolute paths and only lists dirs that actually exist', () => {
|
||||
const dirs = resolveEvalNodeDefinitionDirs();
|
||||
for (const dir of dirs) {
|
||||
expect(path.isAbsolute(dir)).toBe(true);
|
||||
expect(dir.endsWith('node-definitions')).toBe(true);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
import type { WorkflowNodeResponse } from '../../clients/n8n-client';
|
||||
import type { BinaryCheck } from '../types';
|
||||
|
||||
// Trigger node types that accept unsolicited inbound traffic and expose an
// `authentication` parameter; the check below inspects exactly these types.
const INBOUND_TRIGGER_TYPES = new Set([
	'n8n-nodes-base.webhook',
	'n8n-nodes-base.formTrigger',
	'@n8n/n8n-nodes-langchain.chatTrigger',
	'@n8n/n8n-nodes-langchain.mcpTrigger',
]);
|
||||
|
||||
// Prompt heuristics: if any of these match, the user explicitly asked for
// authentication on an inbound entry point, so a non-default `authentication`
// setting is acceptable. Patterns pair a trigger keyword (webhook/form/chat/
// mcp/inbound/incoming) with an auth keyword within ~80 characters, in either
// order, plus the direct "authenticated webhook/form/chat/mcp" phrasing.
const EXPLICIT_INBOUND_AUTH_PATTERNS = [
	/\bauthenticated\s+(?:webhook|form|chat|mcp)\b/i,
	/\b(?:webhook|form|chat|mcp|inbound|incoming)\b.{0,80}\b(?:auth|authenticated|authentication|authorization|bearer|jwt|basic auth|header auth|api key|token|password)\b/i,
	/\b(?:require|requires|requiring|protect|secure|authenticate)\b.{0,80}\b(?:webhook|form|chat|mcp|inbound|incoming)\b/i,
	/\b(?:webhook|form|chat|mcp|inbound|incoming)\b.{0,80}\b(?:require|protect|secure|authenticate)\b/i,
];
|
||||
|
||||
function explicitlyRequestsInboundAuth(prompt: string): boolean {
|
||||
return EXPLICIT_INBOUND_AUTH_PATTERNS.some((pattern) => pattern.test(prompt));
|
||||
}
|
||||
|
||||
function hasNonDefaultAuthentication(node: WorkflowNodeResponse): boolean {
|
||||
const auth = node.parameters?.authentication;
|
||||
return typeof auth === 'string' && auth !== 'none';
|
||||
}
|
||||
|
||||
function getAuthentication(node: WorkflowNodeResponse): string {
|
||||
const auth = node.parameters?.authentication;
|
||||
return typeof auth === 'string' ? auth : '';
|
||||
}
|
||||
|
||||
export const inboundTriggerAuthDefaults: BinaryCheck = {
|
||||
name: 'inbound_trigger_auth_defaults',
|
||||
description: 'Inbound trigger nodes keep authentication disabled unless the user asks for it',
|
||||
kind: 'deterministic',
|
||||
run(workflow, ctx) {
|
||||
if (explicitlyRequestsInboundAuth(ctx.prompt)) return { pass: true };
|
||||
|
||||
const issues = (workflow.nodes ?? [])
|
||||
.filter((node) => INBOUND_TRIGGER_TYPES.has(node.type) && hasNonDefaultAuthentication(node))
|
||||
.map((node) => `"${node.name}" sets authentication to "${getAuthentication(node)}"`);
|
||||
|
||||
return {
|
||||
pass: issues.length === 0,
|
||||
...(issues.length > 0 ? { comment: issues.join('; ') } : {}),
|
||||
};
|
||||
},
|
||||
};
|
||||
|
|
@ -14,6 +14,7 @@ import { handlesMultipleItems } from './handles-multiple-items';
|
|||
import { hasNodes } from './has-nodes';
|
||||
import { hasStartNode } from './has-start-node';
|
||||
import { hasTrigger } from './has-trigger';
|
||||
import { inboundTriggerAuthDefaults } from './inbound-trigger-auth-defaults';
|
||||
import { memoryProperlyConnected } from './memory-properly-connected';
|
||||
import { memorySessionKeyExpression } from './memory-session-key-expression';
|
||||
import { noDisabledNodes } from './no-disabled-nodes';
|
||||
|
|
@ -48,6 +49,7 @@ export const DETERMINISTIC_CHECKS: BinaryCheck[] = [
|
|||
noInvalidFromAi,
|
||||
toolsHaveParameters,
|
||||
noUnreachableNodes,
|
||||
inboundTriggerAuthDefaults,
|
||||
validNodeConfig,
|
||||
];
|
||||
|
||||
|
|
|
|||
960
packages/@n8n/instance-ai/evaluations/cli/compare-pairwise.ts
Normal file
960
packages/@n8n/instance-ai/evaluations/cli/compare-pairwise.ts
Normal file
|
|
@ -0,0 +1,960 @@
|
|||
// ---------------------------------------------------------------------------
|
||||
// Side-by-side comparison report for two pairwise eval runs
|
||||
// (typically: ai-workflow-builder.ee vs instance-ai).
|
||||
//
|
||||
// Usage:
|
||||
// pnpm tsx evaluations/cli/compare-pairwise.ts \
|
||||
// --ee-dir ../ai-workflow-builder.ee/evaluations/.output/pairwise/<ts> \
|
||||
// --ia-dir .output/pairwise/<ts> \
|
||||
// --out .output/pairwise/comparison.html
|
||||
//
|
||||
// Both directories must contain a `summary.json`. Per-example data layouts
|
||||
// differ between the builders, so the loaders below normalize into a shared
|
||||
// `BuilderRecord` shape, joined by prompt text.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
import { jsonParse } from 'n8n-workflow';
|
||||
import { promises as fs } from 'node:fs';
|
||||
import path from 'node:path';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Shared shape after normalization
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export interface FeedbackEntry {
|
||||
metric: string;
|
||||
score: number;
|
||||
kind?: string;
|
||||
comment?: string;
|
||||
}
|
||||
|
||||
export interface BuilderRecord {
|
||||
prompt: string;
|
||||
/** Stable id for the example. For IA, the LangSmith dataset example id;
|
||||
* for EE, the example directory name (e.g. `example-000-ab12cd`). */
|
||||
exampleId?: string;
|
||||
dos?: string;
|
||||
donts?: string;
|
||||
workflow: unknown;
|
||||
durationMs: number;
|
||||
success: boolean;
|
||||
errorClass?: string;
|
||||
errorMessage?: string;
|
||||
feedback: FeedbackEntry[];
|
||||
tokenInput?: number;
|
||||
tokenOutput?: number;
|
||||
}
|
||||
|
||||
interface BuilderSummary {
|
||||
label: string;
|
||||
dataset?: string;
|
||||
judgeModel?: string;
|
||||
numJudges?: number;
|
||||
startedAt?: string;
|
||||
finishedAt?: string;
|
||||
totals: {
|
||||
examples: number;
|
||||
buildSuccess: number;
|
||||
buildFailures: Record<string, number>;
|
||||
primaryPassRate: number;
|
||||
avgDiagnostic: number;
|
||||
avgDurationMs: number;
|
||||
};
|
||||
}
|
||||
|
||||
interface BuilderRun {
|
||||
summary: BuilderSummary;
|
||||
records: BuilderRecord[];
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Instance AI loader (writes results.jsonl + workflows/<id>.json + summary.json)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
interface IAResultRecord {
|
||||
exampleId: string;
|
||||
iteration: number;
|
||||
prompt: string;
|
||||
dos?: string;
|
||||
donts?: string;
|
||||
workflow: unknown;
|
||||
build: {
|
||||
success: boolean;
|
||||
errorClass?: string;
|
||||
errorMessage?: string;
|
||||
durationMs: number;
|
||||
tokenUsage?: { input?: number; output?: number };
|
||||
};
|
||||
feedback: Array<{ metric: string; score: number; kind?: string; comment?: string }>;
|
||||
}
|
||||
|
||||
interface IASummary {
|
||||
builder: string;
|
||||
dataset: string;
|
||||
judgeModel: string;
|
||||
numJudges: number;
|
||||
startedAt: string;
|
||||
finishedAt: string;
|
||||
totals: {
|
||||
examples: number;
|
||||
buildSuccess: number;
|
||||
buildFailures: Record<string, number>;
|
||||
primaryPassRate: number;
|
||||
avgDiagnostic: number;
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Loads an instance-ai pairwise run directory into the shared BuilderRun shape.
 *
 * Expects `summary.json` plus `results.jsonl` (one IAResultRecord per line).
 * Only iteration-1 records are kept, and the totals are recomputed from that
 * filtered set — not copied from summary.json — so the headline numbers stay
 * consistent with the records actually rendered in the comparison.
 */
async function loadInstanceAiRun(dir: string): Promise<BuilderRun> {
	const summaryPath = path.join(dir, 'summary.json');
	const resultsPath = path.join(dir, 'results.jsonl');
	const [summaryRaw, resultsRaw] = await Promise.all([
		fs.readFile(summaryPath, 'utf8'),
		fs.readFile(resultsPath, 'utf8'),
	]);
	const summary = jsonParse<IASummary>(summaryRaw, {
		errorMessage: `Failed to parse ${summaryPath}`,
	});
	// JSONL: skip blank lines, parse each remaining line independently.
	const records = resultsRaw
		.split('\n')
		.filter((line) => line.trim().length > 0)
		.map((line) =>
			jsonParse<IAResultRecord>(line, {
				errorMessage: `Failed to parse a line in ${resultsPath}`,
			}),
		)
		// Use only iteration 1 for a fair 1:1 comparison.
		.filter((r) => r.iteration === 1);

	// Flatten the nested IA record into the builder-agnostic shape.
	const normalized: BuilderRecord[] = records.map((r) => ({
		prompt: r.prompt,
		exampleId: r.exampleId,
		dos: r.dos,
		donts: r.donts,
		workflow: r.workflow,
		durationMs: r.build.durationMs,
		success: r.build.success,
		errorClass: r.build.errorClass,
		errorMessage: r.build.errorMessage,
		feedback: r.feedback,
		tokenInput: r.build.tokenUsage?.input,
		tokenOutput: r.build.tokenUsage?.output,
	}));

	const avgDuration =
		normalized.length === 0
			? 0
			: normalized.reduce((sum, r) => sum + r.durationMs, 0) / normalized.length;

	// Recompute totals from the filtered set so the comparison summary stays
	// consistent with the rendered records (1:1 across builders, iter 1 only).
	const buildSuccess = normalized.filter((r) => r.success).length;
	const buildFailures: Record<string, number> = {};
	for (const r of normalized) {
		if (r.success) continue;
		const key = r.errorClass ?? 'error';
		buildFailures[key] = (buildFailures[key] ?? 0) + 1;
	}
	const primaryPasses = normalized.filter(
		(r) => findScore(r.feedback, 'pairwise_primary') === 1,
	).length;
	const primaryPassRate = normalized.length === 0 ? 0 : primaryPasses / normalized.length;
	// Diagnostic average ignores records with no finite diagnostic score.
	const diagnosticScores = normalized
		.map((r) => findScore(r.feedback, 'pairwise_diagnostic'))
		.filter((v): v is number => v !== undefined && Number.isFinite(v));
	const avgDiagnostic =
		diagnosticScores.length === 0
			? 0
			: diagnosticScores.reduce((a, b) => a + b, 0) / diagnosticScores.length;

	return {
		summary: {
			label: `${summary.builder} (instance-ai)`,
			dataset: summary.dataset,
			judgeModel: summary.judgeModel,
			numJudges: summary.numJudges,
			startedAt: summary.startedAt,
			finishedAt: summary.finishedAt,
			totals: {
				examples: normalized.length,
				buildSuccess,
				buildFailures,
				primaryPassRate,
				avgDiagnostic,
				avgDurationMs: avgDuration,
			},
		},
		records: normalized,
	};
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// EE loader (writes example-NNN-HASH/{prompt.txt, workflow.json, feedback.json}
|
||||
// + summary.json with an aggregate `evaluatorAverages`).
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
interface EEFeedbackJson {
|
||||
index: number;
|
||||
status: string;
|
||||
durationMs: number;
|
||||
generationDurationMs?: number;
|
||||
generationInputTokens?: number;
|
||||
generationOutputTokens?: number;
|
||||
score?: number;
|
||||
evaluators?: Array<{
|
||||
name: string;
|
||||
feedback: Array<{
|
||||
key: string;
|
||||
metric: string;
|
||||
score: number;
|
||||
kind?: string;
|
||||
comment?: string;
|
||||
}>;
|
||||
averageScore?: number;
|
||||
}>;
|
||||
allFeedback?: Array<{
|
||||
evaluator: string;
|
||||
metric: string;
|
||||
score: number;
|
||||
kind?: string;
|
||||
comment?: string;
|
||||
}>;
|
||||
}
|
||||
|
||||
interface EESummaryJson {
|
||||
timestamp?: string;
|
||||
totalExamples: number;
|
||||
passed: number;
|
||||
failed: number;
|
||||
errors: number;
|
||||
passRate: number;
|
||||
averageScore?: number;
|
||||
evaluatorAverages?: Record<string, number>;
|
||||
totalDurationMs?: number;
|
||||
}
|
||||
|
||||
/**
 * Loads an ai-workflow-builder.ee pairwise run directory into BuilderRun.
 *
 * EE layout: an optional top-level `summary.json` plus one `example-*`
 * directory per example containing `prompt.txt`, `workflow.json`,
 * `feedback.json`, and (on failure) `error.txt`. Every artifact except the
 * prompt is optional; examples without a prompt are skipped entirely.
 */
async function loadEERun(dir: string): Promise<BuilderRun> {
	const summaryPath = path.join(dir, 'summary.json');
	const summaryRaw = await readOptional(summaryPath);
	const summary = summaryRaw
		? jsonParse<EESummaryJson>(summaryRaw, { errorMessage: `Failed to parse ${summaryPath}` })
		: null;

	const entries = await fs.readdir(dir, { withFileTypes: true });
	const exampleDirs = entries
		.filter((e) => e.isDirectory() && e.name.startsWith('example-'))
		.map((e) => path.join(dir, e.name));

	const records: BuilderRecord[] = [];
	for (const exampleDir of exampleDirs) {
		const promptPath = path.join(exampleDir, 'prompt.txt');
		const workflowPath = path.join(exampleDir, 'workflow.json');
		const feedbackPath = path.join(exampleDir, 'feedback.json');
		const errorPath = path.join(exampleDir, 'error.txt');

		// No prompt means the example can't be joined to the IA run — skip it.
		const prompt = await readOptional(promptPath);
		if (!prompt) continue;

		const [workflowRaw, feedbackRaw, errorRaw] = await Promise.all([
			readOptional(workflowPath),
			readOptional(feedbackPath),
			readOptional(errorPath),
		]);

		const workflow = workflowRaw
			? jsonParse<unknown>(workflowRaw, { errorMessage: `Failed to parse ${workflowPath}` })
			: null;
		const feedbackJson = feedbackRaw
			? jsonParse<EEFeedbackJson>(feedbackRaw, {
					errorMessage: `Failed to parse ${feedbackPath}`,
				})
			: null;
		// The directory name (e.g. `example-000-ab12cd`) doubles as the id.
		const exampleId = path.basename(exampleDir);

		const feedback: FeedbackEntry[] = [];
		// Prefer `allFeedback` (flat list, matches IA shape), fall back to nested evaluators.
		if (feedbackJson?.allFeedback) {
			for (const f of feedbackJson.allFeedback) {
				feedback.push({ metric: f.metric, score: f.score, kind: f.kind, comment: f.comment });
			}
		} else if (feedbackJson?.evaluators) {
			for (const ev of feedbackJson.evaluators) {
				for (const f of ev.feedback) {
					feedback.push({ metric: f.metric, score: f.score, kind: f.kind, comment: f.comment });
				}
			}
		}

		// EE status: 'pass' | 'fail' | 'error'. Only 'error' means the workflow
		// was never built — 'fail' means it was built but the eval marked it
		// non-passing. We separate those: `success` = workflow exists.
		const status = feedbackJson?.status ?? 'unknown';
		const success = status !== 'error' && workflow !== null;
		const errorClass = status === 'error' ? 'error' : success ? undefined : status;

		records.push({
			prompt,
			exampleId,
			dos: extractDosFromPrompt(prompt) ?? undefined,
			donts: extractDontsFromPrompt(prompt) ?? undefined,
			workflow,
			durationMs: feedbackJson?.durationMs ?? 0,
			success,
			errorClass,
			errorMessage: errorRaw ?? undefined,
			feedback,
			tokenInput: feedbackJson?.generationInputTokens,
			tokenOutput: feedbackJson?.generationOutputTokens,
		});
	}

	// Aggregates mirror the IA loader so both summaries are comparable.
	const avgDuration =
		records.length === 0 ? 0 : records.reduce((sum, r) => sum + r.durationMs, 0) / records.length;
	const primaryPassCount = records.filter(
		(r) => findScore(r.feedback, 'pairwise_primary') === 1,
	).length;
	const diagnosticScores = records
		.map((r) => findScore(r.feedback, 'pairwise_diagnostic'))
		.filter((v): v is number => v !== undefined && Number.isFinite(v));
	const avgDiagnostic =
		diagnosticScores.length === 0
			? 0
			: diagnosticScores.reduce((a, b) => a + b, 0) / diagnosticScores.length;

	const buildFailures: Record<string, number> = {};
	for (const r of records) {
		if (!r.success) {
			const key = r.errorClass ?? 'error';
			buildFailures[key] = (buildFailures[key] ?? 0) + 1;
		}
	}

	const errorCount = records.filter((r) => !r.success).length;
	const buildSuccessCount = records.length - errorCount;

	return {
		summary: {
			label: 'Code Builder',
			startedAt: summary?.timestamp,
			totals: {
				// Prefer summary.json's totals when present; fall back to what we
				// actually loaded from disk.
				examples: summary?.totalExamples ?? records.length,
				buildSuccess: summary ? summary.totalExamples - summary.errors : buildSuccessCount,
				buildFailures,
				primaryPassRate: records.length === 0 ? 0 : primaryPassCount / records.length,
				avgDiagnostic,
				avgDurationMs: avgDuration,
			},
		},
		records,
	};
}
|
||||
|
||||
async function readOptional(filePath: string): Promise<string | null> {
|
||||
try {
|
||||
return await fs.readFile(filePath, 'utf8');
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// EE prompts in `notion-pairwise-workflows` don't carry dos/donts text — those
// are LangSmith inputs, not in prompt.txt. Return undefined so the IA criteria
// (which we have) drive the rendering. These stubs are placeholders in case we
// later hand-encode criteria into prompt.txt.
// Always returns null: see the note above — criteria never live in prompt.txt today.
function extractDosFromPrompt(_prompt: string): string | null {
	return null;
}
// Always returns null for the same reason as extractDosFromPrompt.
function extractDontsFromPrompt(_prompt: string): string | null {
	return null;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function findScore(feedback: FeedbackEntry[], metric: string): number | undefined {
|
||||
return feedback.find((f) => f.metric === metric)?.score;
|
||||
}
|
||||
|
||||
function escapeHtml(input: string): string {
|
||||
return input
|
||||
.replace(/&/g, '&')
|
||||
.replace(/</g, '<')
|
||||
.replace(/>/g, '>')
|
||||
.replace(/"/g, '"')
|
||||
.replace(/'/g, ''');
|
||||
}
|
||||
|
||||
function escapeAttr(input: string): string {
|
||||
return input.replace(/&/g, '&').replace(/'/g, ''').replace(/"/g, '"');
|
||||
}
|
||||
|
||||
function formatDuration(ms: number): string {
|
||||
if (ms < 1000) return `${ms}ms`;
|
||||
if (ms < 60_000) return `${(ms / 1000).toFixed(1)}s`;
|
||||
const minutes = Math.floor(ms / 60_000);
|
||||
const seconds = Math.floor((ms % 60_000) / 1000);
|
||||
return `${minutes}m${seconds.toString().padStart(2, '0')}s`;
|
||||
}
|
||||
|
||||
function pct(n: number): string {
|
||||
return `${(n * 100).toFixed(1)}%`;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Pairing
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** One prompt's side-by-side result: the EE and IA records joined by prompt text. */
export interface ComparisonRow {
	prompt: string;
	// dos/donts come from the IA record when available (EE prompts lack them).
	dos?: string;
	donts?: string;
	// Either side may be missing when a prompt appears in only one run.
	ee?: BuilderRecord;
	ia?: BuilderRecord;
	// Who passed the primary pairwise metric; 'neither' is only the initial value.
	verdict: 'both-pass' | 'both-fail' | 'ee-only' | 'ia-only' | 'neither';
}
|
||||
|
||||
/**
|
||||
* Normalize prompt text used as the join key. EE and IA generate dirs/IDs
|
||||
* via different schemes, so we have to match by prompt. Trim + collapse
|
||||
* whitespace so trivial drift (CRLF, trailing space, indented blocks)
|
||||
* doesn't silently un-pair otherwise-identical examples.
|
||||
*/
|
||||
export function promptJoinKey(prompt: string): string {
|
||||
return prompt.replace(/\s+/g, ' ').trim();
|
||||
}
|
||||
|
||||
export function pairRecords(ee: BuilderRecord[], ia: BuilderRecord[]): ComparisonRow[] {
|
||||
const byKey = new Map<string, ComparisonRow>();
|
||||
const ensure = (prompt: string): ComparisonRow => {
|
||||
const key = promptJoinKey(prompt);
|
||||
const existing = byKey.get(key);
|
||||
if (existing) return existing;
|
||||
const created: ComparisonRow = { prompt, verdict: 'neither' };
|
||||
byKey.set(key, created);
|
||||
return created;
|
||||
};
|
||||
|
||||
for (const r of ee) {
|
||||
const row = ensure(r.prompt);
|
||||
row.ee = r;
|
||||
}
|
||||
for (const r of ia) {
|
||||
const row = ensure(r.prompt);
|
||||
row.ia = r;
|
||||
// IA carries the dos/donts text, prefer it as the source of truth.
|
||||
if (r.dos) row.dos = r.dos;
|
||||
if (r.donts) row.donts = r.donts;
|
||||
}
|
||||
|
||||
// Compute verdict for each row.
|
||||
for (const row of byKey.values()) {
|
||||
const eePass = row.ee && row.ee.success && findScore(row.ee.feedback, 'pairwise_primary') === 1;
|
||||
const iaPass = row.ia && row.ia.success && findScore(row.ia.feedback, 'pairwise_primary') === 1;
|
||||
row.verdict =
|
||||
eePass && iaPass ? 'both-pass' : eePass ? 'ee-only' : iaPass ? 'ia-only' : 'both-fail';
|
||||
}
|
||||
|
||||
const order: Record<ComparisonRow['verdict'], number> = {
|
||||
'ee-only': 0,
|
||||
'ia-only': 1,
|
||||
'both-fail': 2,
|
||||
'both-pass': 3,
|
||||
neither: 4,
|
||||
};
|
||||
return [...byKey.values()].sort((a, b) => {
|
||||
const ord = order[a.verdict] - order[b.verdict];
|
||||
if (ord !== 0) return ord;
|
||||
return a.prompt.localeCompare(b.prompt);
|
||||
});
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Rendering
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function renderCriteriaList(raw: string | undefined, kind: 'do' | 'dont'): string {
|
||||
if (!raw) return '';
|
||||
const lines = raw
|
||||
.split('\n')
|
||||
.map((line) => line.trim())
|
||||
.filter((line) => line.length > 0);
|
||||
if (lines.length === 0) return '';
|
||||
const items = lines.map((line) => `<li>${escapeHtml(line)}</li>`).join('');
|
||||
const label = kind === 'do' ? 'Do' : "Don't";
|
||||
return `<div class="criteria ${kind}"><h4>${label}</h4><ul>${items}</ul></div>`;
|
||||
}
|
||||
|
||||
function renderWorkflow(workflow: unknown): string {
|
||||
if (!workflow) {
|
||||
return '<div class="no-workflow">No workflow built.</div>';
|
||||
}
|
||||
const json = JSON.stringify(workflow);
|
||||
return `<n8n-demo workflow="${escapeAttr(json)}" frame="true" clicktointeract="true" collapseformobile="true"></n8n-demo>`;
|
||||
}
|
||||
|
||||
function renderJudgeRows(feedback: FeedbackEntry[]): string {
|
||||
const judges = feedback.filter((f) => /^judge\d+$/.test(f.metric));
|
||||
if (judges.length === 0) return '';
|
||||
const rows = judges
|
||||
.map((j) => {
|
||||
const cls = j.score === 1 ? 'judge-pass' : 'judge-fail';
|
||||
const comment = j.comment ? escapeHtml(j.comment) : '<em>no violations</em>';
|
||||
return `<tr><td class="${cls}">${escapeHtml(j.metric)}</td><td>${j.score}</td><td>${comment}</td></tr>`;
|
||||
})
|
||||
.join('');
|
||||
return `<table class="judges"><thead><tr><th>Judge</th><th>Pass</th><th>Notes</th></tr></thead><tbody>${rows}</tbody></table>`;
|
||||
}
|
||||
|
||||
/**
 * Pre-rendered summary-line fragments for one builder's result, used by the
 * collapsed-row chips in the report.
 */
interface BuilderHeadline {
	// Ready-to-insert HTML badge (PASS / FAIL / BUILD … / N/A).
	statusBadge: string;
	// Machine-readable status; selects the chip-* CSS class.
	statusKind: 'pass' | 'fail' | 'missing';
	metaText: string; // duration · diagnostic · token info
}
|
||||
|
||||
function buildHeadline(record: BuilderRecord | undefined): BuilderHeadline {
|
||||
if (!record) {
|
||||
return {
|
||||
statusBadge: '<span class="status status-missing">N/A</span>',
|
||||
statusKind: 'missing',
|
||||
metaText: '—',
|
||||
};
|
||||
}
|
||||
const primary = findScore(record.feedback, 'pairwise_primary');
|
||||
const diagnostic = findScore(record.feedback, 'pairwise_diagnostic');
|
||||
|
||||
const statusBadge = !record.success
|
||||
? `<span class="status status-fail">BUILD ${escapeHtml(record.errorClass ?? 'error').toUpperCase()}</span>`
|
||||
: primary === 1
|
||||
? '<span class="status status-pass">PASS</span>'
|
||||
: '<span class="status status-fail">FAIL</span>';
|
||||
const statusKind: BuilderHeadline['statusKind'] = !record.success
|
||||
? 'fail'
|
||||
: primary === 1
|
||||
? 'pass'
|
||||
: 'fail';
|
||||
|
||||
const metaParts: string[] = [formatDuration(record.durationMs)];
|
||||
if (diagnostic !== undefined) metaParts.push(`diag ${diagnostic.toFixed(2)}`);
|
||||
return { statusBadge, statusKind, metaText: metaParts.join(' · ') };
|
||||
}
|
||||
|
||||
function renderBuilderColumn(label: string, record: BuilderRecord | undefined): string {
|
||||
if (!record) {
|
||||
return `<div class="builder-col missing"><div class="builder-label">${escapeHtml(label)}</div><div class="missing-msg">No record for this prompt.</div></div>`;
|
||||
}
|
||||
|
||||
const primary = findScore(record.feedback, 'pairwise_primary');
|
||||
const diagnostic = findScore(record.feedback, 'pairwise_diagnostic');
|
||||
const totalPasses = findScore(record.feedback, 'pairwise_total_passes');
|
||||
const totalViolations = findScore(record.feedback, 'pairwise_total_violations');
|
||||
|
||||
const statusBadge = !record.success
|
||||
? `<span class="status status-fail">BUILD ${escapeHtml(record.errorClass ?? 'error').toUpperCase()}</span>`
|
||||
: primary === 1
|
||||
? '<span class="status status-pass">PASS</span>'
|
||||
: '<span class="status status-fail">FAIL</span>';
|
||||
|
||||
const metaParts: string[] = [`<span>${formatDuration(record.durationMs)}</span>`];
|
||||
if (diagnostic !== undefined) {
|
||||
metaParts.push(`<span>diag ${diagnostic.toFixed(2)}</span>`);
|
||||
}
|
||||
if (totalPasses !== undefined && totalViolations !== undefined) {
|
||||
metaParts.push(`<span>${totalPasses}p / ${totalViolations}v</span>`);
|
||||
}
|
||||
if (record.tokenInput !== undefined && record.tokenOutput !== undefined) {
|
||||
metaParts.push(`<span>${record.tokenInput}+${record.tokenOutput} tok</span>`);
|
||||
}
|
||||
|
||||
const errorBlock = record.errorMessage
|
||||
? `<div class="error">${escapeHtml(record.errorMessage)}</div>`
|
||||
: '';
|
||||
|
||||
const idLine = record.exampleId
|
||||
? `<div class="builder-id" title="${escapeAttr(record.exampleId)}">${escapeHtml(record.exampleId)}</div>`
|
||||
: '';
|
||||
|
||||
return `<div class="builder-col">
|
||||
<div class="builder-header">
|
||||
<div class="builder-label">${escapeHtml(label)}</div>
|
||||
${statusBadge}
|
||||
</div>
|
||||
${idLine}
|
||||
<div class="builder-meta">${metaParts.join(' · ')}</div>
|
||||
${errorBlock}
|
||||
<div class="workflow-wrap">${renderWorkflow(record.workflow)}</div>
|
||||
${renderJudgeRows(record.feedback)}
|
||||
</div>`;
|
||||
}
|
||||
|
||||
/**
 * Render one comparison row as a collapsible <details> element.
 *
 * The summary line shows the verdict badge, example id(s), a truncated
 * prompt preview, and one headline chip per builder. The expanded body
 * holds the full prompt, dos/donts criteria, and the heavy side-by-side
 * builder columns (deferred — see the <template> note below).
 *
 * @param row - Paired EE/IA results for one prompt.
 * @param index - Position in the sorted row list; used only for the DOM id.
 * @returns A self-contained HTML fragment.
 */
function renderRow(row: ComparisonRow, index: number): string {
	// Display text and CSS class per verdict; 'neither' is the unreached default.
	const verdictLabel: Record<ComparisonRow['verdict'], string> = {
		'both-pass': 'BOTH PASS',
		'both-fail': 'BOTH FAIL',
		'ee-only': 'CODE ONLY',
		'ia-only': 'IA ONLY',
		neither: '—',
	};
	const verdictCls: Record<ComparisonRow['verdict'], string> = {
		'both-pass': 'verdict-both-pass',
		'both-fail': 'verdict-both-fail',
		'ee-only': 'verdict-ee-only',
		'ia-only': 'verdict-ia-only',
		neither: 'verdict-neither',
	};

	const eeHead = buildHeadline(row.ee);
	const iaHead = buildHeadline(row.ia);
	// Truncate long prompts for the one-line summary; full text is in the body.
	const promptPreview = row.prompt.slice(0, 110) + (row.prompt.length > 110 ? '…' : '');

	// One compact status chip per builder for the collapsed summary line.
	const builderChip = (label: string, head: BuilderHeadline): string =>
		`<span class="builder-chip chip-${head.statusKind}">
			<span class="chip-label">${escapeHtml(label)}</span>
			${head.statusBadge}
			<span class="chip-meta">${escapeHtml(head.metaText)}</span>
		</span>`;

	// Show the IA example id first; append the EE id only when it differs.
	const ids: string[] = [];
	if (row.ia?.exampleId) ids.push(row.ia.exampleId);
	if (row.ee?.exampleId && row.ee.exampleId !== row.ia?.exampleId) ids.push(row.ee.exampleId);
	const idText = ids.join(' / ');
	const idChip = `<span class="example-id" title="${escapeAttr(idText)}">${escapeHtml(idText)}</span>`;

	// Heavy content (workflow previews + judge tables) is wrapped in a <template>
	// so the n8n-demo web component is NOT instantiated until the user expands
	// the row. The lazy loader script in the document head does the swap.
	return `<details class="row ${verdictCls[row.verdict]}" id="row-${index}">
	<summary>
		<span class="verdict">${verdictLabel[row.verdict]}</span>
		${idChip}
		<span class="prompt-preview">${escapeHtml(promptPreview)}</span>
		<span class="builder-chips">
			${builderChip('Code', eeHead)}
			${builderChip('IA', iaHead)}
		</span>
	</summary>
	<div class="body">
		<section class="prompt-block">
			<h3>Prompt</h3>
			<pre>${escapeHtml(row.prompt)}</pre>
		</section>
		<section class="criteria-row">
			${renderCriteriaList(row.dos, 'do')}
			${renderCriteriaList(row.donts, 'dont')}
		</section>
		<div class="lazy-slot" data-loaded="false">
			<template>
				<div class="builder-grid">
					${renderBuilderColumn('Code Builder', row.ee)}
					${renderBuilderColumn('instance-ai', row.ia)}
				</div>
			</template>
			<div class="lazy-placeholder">Click to load workflow previews and judge details…</div>
		</div>
	</div>
</details>`;
}
|
||||
|
||||
function renderSummaryCard(
|
||||
label: string,
|
||||
summary: BuilderSummary,
|
||||
totalRecords: number,
|
||||
records: BuilderRecord[],
|
||||
): string {
|
||||
const failureBits = Object.entries(summary.totals.buildFailures)
|
||||
.map(([k, v]) => `${k}: ${v}`)
|
||||
.join(', ');
|
||||
const primaryPasses = records.filter(
|
||||
(r) => findScore(r.feedback, 'pairwise_primary') === 1,
|
||||
).length;
|
||||
const overallPassRate = totalRecords === 0 ? 0 : primaryPasses / totalRecords;
|
||||
return `<div class="summary-card">
|
||||
<h2>${escapeHtml(label)}</h2>
|
||||
${summary.dataset ? `<div class="meta-row">Dataset: <code>${escapeHtml(summary.dataset)}</code></div>` : ''}
|
||||
${summary.judgeModel ? `<div class="meta-row">Judge: ${escapeHtml(summary.judgeModel)} × ${summary.numJudges ?? 1}</div>` : ''}
|
||||
${summary.startedAt ? `<div class="meta-row">Started: ${escapeHtml(summary.startedAt)}</div>` : ''}
|
||||
<div class="metric"><strong>${pct(overallPassRate)}</strong><span>primary pass</span></div>
|
||||
<div class="metric"><strong>${summary.totals.avgDiagnostic.toFixed(2)}</strong><span>avg diagnostic</span></div>
|
||||
<div class="metric"><strong>${formatDuration(summary.totals.avgDurationMs)}</strong><span>avg build time</span></div>
|
||||
<div class="metric"><strong>${summary.totals.buildSuccess}/${totalRecords}</strong><span>built ok</span></div>
|
||||
${failureBits ? `<div class="meta-row failures">Failures: ${escapeHtml(failureBits)}</div>` : ''}
|
||||
</div>`;
|
||||
}
|
||||
|
||||
/**
 * Static legend explaining how each headline metric is computed.
 * Pure constant HTML — takes no input and returns the same fragment every call.
 */
function renderMetricsNote(): string {
	return `<aside class="metrics-note">
	<strong>Metric definitions:</strong>
	<span><b>Primary pass</b> — workflow passes only if a majority of LLM judges (2 of 3) find zero "don't" violations. Computed over all prompt attempts; build failures count as fail.</span>
	<span><b>Average diagnostic</b> — mean fraction of criteria (dos + don'ts) satisfied across the dataset, averaged across judges. Range 0–1; gives partial credit.</span>
	<span><b>Average build time</b> — averaged across all attempts including failures, so build timeouts (20-min cap) inflate this number.</span>
	<span><b>Verdicts</b> compare per-prompt primary pass between the two builders.</span>
</aside>`;
}
|
||||
|
||||
function renderVerdictTotals(rows: ComparisonRow[]): string {
|
||||
const counts: Record<ComparisonRow['verdict'], number> = {
|
||||
'both-pass': 0,
|
||||
'both-fail': 0,
|
||||
'ee-only': 0,
|
||||
'ia-only': 0,
|
||||
neither: 0,
|
||||
};
|
||||
for (const r of rows) counts[r.verdict]++;
|
||||
|
||||
const total = rows.length;
|
||||
const card = (label: string, n: number, cls: string): string =>
|
||||
`<div class="verdict-card ${cls}"><strong>${n}</strong><span>${escapeHtml(label)}</span><em>${total === 0 ? '0%' : pct(n / total)}</em></div>`;
|
||||
|
||||
return `<div class="verdict-grid">
|
||||
${card('Both pass', counts['both-pass'], 'verdict-both-pass')}
|
||||
${card('Code Builder only passes', counts['ee-only'], 'verdict-ee-only')}
|
||||
${card('IA only passes', counts['ia-only'], 'verdict-ia-only')}
|
||||
${card('Both fail', counts['both-fail'], 'verdict-both-fail')}
|
||||
</div>`;
|
||||
}
|
||||
|
||||
/**
 * Assemble the full standalone comparison report as a single HTML document:
 * summary cards for both builders, the verdict grid, the metric legend, and
 * one collapsible row per prompt. The head pulls the n8n-demo web component
 * (and its webcomponents/lit polyfills) from CDNs, so rendering workflow
 * previews requires network access when the report is opened. The trailing
 * inline script lazily materializes each row's heavy content on first expand.
 *
 * @param ee - Loaded ai-workflow-builder.ee (Code Builder) run.
 * @param ia - Loaded instance-ai run.
 * @param rows - Pre-paired, pre-sorted comparison rows (see pairRecords).
 * @returns The complete HTML document text.
 */
function renderDocument(ee: BuilderRun, ia: BuilderRun, rows: ComparisonRow[]): string {
	return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>Pairwise Eval Comparison — Code Builder vs Instance AI</title>
<script defer src="https://cdn.jsdelivr.net/npm/@webcomponents/webcomponentsjs@2.0.0/webcomponents-loader.js"></script>
<script defer src="https://www.unpkg.com/lit@2.0.0-rc.2/polyfill-support.js"></script>
<script type="module" src="https://cdn.jsdelivr.net/npm/@n8n_io/n8n-demo-component/n8n-demo.bundled.js"></script>
<style>
:root {
	font-family: ui-sans-serif, system-ui, -apple-system, sans-serif;
	color-scheme: dark;
	--bg: #0d1117;
	--fg: #e6edf3;
	--muted: #8b949e;
	--border: #30363d;
	--card: #161b22;
	--subtle: #1c2129;
	--pass: #3fb950;
	--fail: #f85149;
	--partial: #d29922;
	--accent: #7c8cff;
	--ee: #818cf8;
	--ia: #2dd4bf;
}
body { margin: 0; background: var(--bg); color: var(--fg); }
header.top { padding: 16px 20px; background: var(--card); border-bottom: 1px solid var(--border); }
header.top h1 { margin: 0 0 6px 0; font-size: 18px; }
header.top .subhead { color: var(--muted); font-size: 13px; }
main { padding: 20px; max-width: 1600px; margin: 0 auto; display: flex; flex-direction: column; gap: 24px; }
.summary-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
.summary-card { background: var(--card); border: 1px solid var(--border); border-radius: 8px; padding: 16px; display: flex; flex-direction: column; gap: 6px; }
.summary-card h2 { margin: 0 0 4px 0; font-size: 15px; }
.summary-card .meta-row { font-size: 12px; color: var(--muted); }
.summary-card .meta-row code { font-family: ui-monospace, monospace; font-size: 11px; background: var(--subtle); padding: 1px 4px; border-radius: 3px; }
.summary-card .metric { display: flex; justify-content: space-between; align-items: baseline; margin-top: 4px; font-size: 13px; }
.summary-card .metric strong { font-size: 18px; color: var(--accent); }
.summary-card .metric span { color: var(--muted); }
.summary-card .meta-row.failures { color: var(--fail); margin-top: 6px; }
.verdict-grid { display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; }
.metrics-note {
	background: var(--card);
	border: 1px solid var(--border);
	border-radius: 8px;
	padding: 12px 16px;
	font-size: 12px;
	color: var(--muted);
	display: flex;
	flex-direction: column;
	gap: 4px;
}
.metrics-note strong { color: var(--fg); font-size: 12px; }
.metrics-note b { color: var(--fg); font-weight: 600; }
.verdict-card { background: var(--card); border: 1px solid var(--border); border-radius: 8px; padding: 14px 16px; display: flex; flex-direction: column; gap: 4px; align-items: flex-start; }
.verdict-card strong { font-size: 26px; font-weight: 700; }
.verdict-card span { color: var(--muted); font-size: 12px; }
.verdict-card em { color: var(--muted); font-size: 11px; font-style: normal; }
.verdict-both-pass strong { color: var(--pass); }
.verdict-both-fail strong { color: var(--fail); }
.verdict-ee-only strong { color: var(--ee); }
.verdict-ia-only strong { color: var(--ia); }
.rows { background: var(--card); border: 1px solid var(--border); border-radius: 8px; overflow: hidden; }
details.row { border-bottom: 1px solid var(--border); }
details.row:last-child { border-bottom: none; }
details.row > summary {
	list-style: none;
	cursor: pointer;
	padding: 10px 16px;
	display: grid;
	grid-template-columns: 110px minmax(0, auto) minmax(0, 1fr) auto;
	gap: 16px;
	align-items: center;
	font-size: 13px;
}
details.row > summary > .example-id {
	font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
	font-size: 11px;
	color: var(--muted);
	white-space: nowrap;
	overflow: hidden;
	text-overflow: ellipsis;
	max-width: 220px;
}
details.row > summary:hover { background: var(--subtle); }
details.row[open] > summary { background: var(--subtle); border-bottom: 1px solid var(--border); }
details.row > summary::-webkit-details-marker { display: none; }
details.row > summary .verdict {
	font-size: 11px;
	font-weight: 700;
	letter-spacing: 0.04em;
	padding: 3px 8px;
	border-radius: 3px;
	text-align: center;
}
details.row.verdict-both-pass > summary .verdict { background: rgba(63,185,80,0.18); color: var(--pass); }
details.row.verdict-both-fail > summary .verdict { background: rgba(248,81,73,0.18); color: var(--fail); }
details.row.verdict-ee-only > summary .verdict { background: rgba(129,140,248,0.2); color: var(--ee); }
details.row.verdict-ia-only > summary .verdict { background: rgba(45,212,191,0.18); color: var(--ia); }
details.row > summary .prompt-preview { color: var(--fg); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
details.row > summary .builder-chips { display: flex; gap: 8px; white-space: nowrap; }
.builder-chip {
	display: inline-flex;
	align-items: center;
	gap: 6px;
	padding: 3px 8px;
	border-radius: 4px;
	font-size: 11px;
	border: 1px solid var(--border);
	background: var(--card);
}
.builder-chip.chip-pass { border-color: rgba(63,185,80,0.4); background: rgba(63,185,80,0.08); }
.builder-chip.chip-fail { border-color: rgba(248,81,73,0.35); background: rgba(248,81,73,0.08); }
.builder-chip.chip-missing { border-color: var(--border); background: var(--subtle); }
.builder-chip .chip-label { font-weight: 700; color: var(--muted); letter-spacing: 0.04em; }
.builder-chip .chip-meta { color: var(--muted); }
.lazy-slot { margin-top: 14px; }
.lazy-placeholder { padding: 18px; text-align: center; color: var(--muted); font-size: 12px; border: 1px dashed var(--border); border-radius: 4px; background: var(--subtle); }
details.row > .body { padding: 16px; background: var(--subtle); border-top: 1px solid var(--border); }
details.row > .body h3 { margin: 0 0 6px 0; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em; color: var(--muted); }
details.row pre { background: var(--bg); border: 1px solid var(--border); border-radius: 4px; padding: 8px 12px; font-size: 12px; white-space: pre-wrap; max-height: 200px; overflow-y: auto; color: var(--fg); }
.criteria-row { display: grid; grid-template-columns: 1fr 1fr; gap: 12px; margin-top: 12px; }
.criteria { border: 1px solid var(--border); border-radius: 4px; padding: 8px 12px; background: var(--card); }
.criteria h4 { margin: 0 0 4px 0; font-size: 11px; text-transform: uppercase; letter-spacing: 0.05em; }
.criteria.do h4 { color: var(--pass); }
.criteria.dont h4 { color: var(--fail); }
.criteria ul { margin: 0; padding-left: 18px; font-size: 12px; }
.builder-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 12px; margin-top: 14px; }
.builder-col { background: var(--card); border: 1px solid var(--border); border-radius: 6px; padding: 12px; display: flex; flex-direction: column; gap: 8px; }
.builder-col.missing { background: var(--subtle); }
.builder-col .missing-msg { color: var(--muted); font-style: italic; font-size: 12px; }
.builder-header { display: flex; justify-content: space-between; align-items: center; }
.builder-label { font-weight: 600; font-size: 13px; }
.status { font-size: 11px; font-weight: 700; padding: 3px 8px; border-radius: 3px; letter-spacing: 0.04em; }
.status-pass { background: rgba(63,185,80,0.2); color: var(--pass); }
.status-fail { background: rgba(248,81,73,0.2); color: var(--fail); }
.builder-meta { font-size: 11px; color: var(--muted); display: flex; gap: 8px; flex-wrap: wrap; }
.builder-id { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; font-size: 11px; color: var(--muted); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
.error { padding: 8px 10px; background: rgba(248,81,73,0.12); color: var(--fail); border-radius: 4px; font-size: 11px; white-space: pre-wrap; max-height: 120px; overflow-y: auto; }
.workflow-wrap { display: flex; }
n8n-demo { display: block; width: 100%; height: 320px; border: 1px solid var(--border); border-radius: 4px; background: #fff; color-scheme: light; }
.no-workflow { padding: 30px; text-align: center; color: var(--muted); font-size: 12px; border: 1px dashed var(--border); border-radius: 4px; flex: 1; }
table.judges { width: 100%; border-collapse: collapse; font-size: 11px; background: var(--card); border: 1px solid var(--border); border-radius: 4px; overflow: hidden; }
table.judges th, table.judges td { padding: 5px 8px; text-align: left; border-bottom: 1px solid var(--border); vertical-align: top; }
table.judges tr:last-child td { border-bottom: none; }
table.judges td.judge-pass { color: var(--pass); font-weight: 600; }
table.judges td.judge-fail { color: var(--fail); font-weight: 600; }
</style>
</head>
<body>
<header class="top">
<h1>Pairwise Eval Comparison — Code Builder vs Instance AI</h1>
<div class="subhead">${rows.length} prompt${rows.length === 1 ? '' : 's'} compared. Rows are ordered: Code-only wins, IA-only wins, both fail, both pass.</div>
</header>
<main>
<section class="summary-row">
${renderSummaryCard('Code Builder', ee.summary, ee.records.length, ee.records)}
${renderSummaryCard('instance-ai', ia.summary, ia.records.length, ia.records)}
</section>
${renderVerdictTotals(rows)}
${renderMetricsNote()}
<section class="rows">
${rows.map((r, i) => renderRow(r, i)).join('\n')}
</section>
</main>
<script>
// Lazy-load heavy preview content (n8n-demo + judge tables) on first expand.
// Each row contains <template> with the workflow previews inside a
// .lazy-slot[data-loaded="false"] div. On the first toggle-open we move the
// template's content into the live DOM so the n8n-demo web component is
// only constructed for rows the user actually reads.
document.querySelectorAll('details.row').forEach((details) => {
	details.addEventListener('toggle', () => {
		if (!details.open) return;
		const slot = details.querySelector('.lazy-slot[data-loaded="false"]');
		if (!slot) return;
		const template = slot.querySelector('template');
		const placeholder = slot.querySelector('.lazy-placeholder');
		if (template) {
			slot.appendChild(template.content.cloneNode(true));
			template.remove();
		}
		if (placeholder) placeholder.remove();
		slot.dataset.loaded = 'true';
	}, { once: true });
});
</script>
</body>
</html>`;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// CLI
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Parsed command-line options for the comparison-report CLI. */
interface CliArgs {
	// Directory holding the ai-workflow-builder.ee (Code Builder) run output.
	eeDir: string;
	// Directory holding the instance-ai run output.
	iaDir: string;
	// Absolute path the HTML report is written to.
	out: string;
}
|
||||
|
||||
function parseArgs(argv: string[]): CliArgs {
|
||||
const get = (flag: string): string | undefined => {
|
||||
const idx = argv.indexOf(flag);
|
||||
if (idx === -1) return undefined;
|
||||
const value = argv[idx + 1];
|
||||
return value && !value.startsWith('--') ? value : undefined;
|
||||
};
|
||||
const eeDir = get('--ee-dir');
|
||||
const iaDir = get('--ia-dir');
|
||||
if (!eeDir || !iaDir) {
|
||||
throw new Error(
|
||||
'Usage: tsx evaluations/cli/compare-pairwise.ts --ee-dir <path> --ia-dir <path> [--out <path>]',
|
||||
);
|
||||
}
|
||||
const defaultOut = path.join(path.dirname(path.resolve(iaDir)), 'comparison.html');
|
||||
const out = path.resolve(get('--out') ?? defaultOut);
|
||||
return { eeDir: path.resolve(eeDir), iaDir: path.resolve(iaDir), out };
|
||||
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
const args = parseArgs(process.argv.slice(2));
|
||||
const [ee, ia] = await Promise.all([loadEERun(args.eeDir), loadInstanceAiRun(args.iaDir)]);
|
||||
|
||||
console.log(
|
||||
`EE records: ${ee.records.length} (pass rate ${pct(ee.summary.totals.primaryPassRate)})`,
|
||||
);
|
||||
console.log(
|
||||
`IA records: ${ia.records.length} (pass rate ${pct(ia.summary.totals.primaryPassRate)})`,
|
||||
);
|
||||
|
||||
const rows = pairRecords(ee.records, ia.records);
|
||||
const matched = rows.filter((r) => r.ee && r.ia).length;
|
||||
console.log(`Joined ${rows.length} prompts (${matched} matched on both sides)`);
|
||||
|
||||
const html = renderDocument(ee, ia, rows);
|
||||
await fs.writeFile(args.out, html, 'utf8');
|
||||
console.log(`Wrote comparison report to ${args.out}`);
|
||||
}
|
||||
|
||||
if (require.main === module) {
|
||||
main().catch((error) => {
|
||||
console.error(error instanceof Error ? (error.stack ?? error.message) : String(error));
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
675
packages/@n8n/instance-ai/evaluations/cli/pairwise.ts
Normal file
675
packages/@n8n/instance-ai/evaluations/cli/pairwise.ts
Normal file
|
|
@ -0,0 +1,675 @@
|
|||
// ---------------------------------------------------------------------------
|
||||
// Pairwise eval CLI for instance-ai.
|
||||
//
|
||||
// Pulls the pairwise dataset (default: notion-pairwise-workflows) from
|
||||
// LangSmith or a local file, builds one workflow per example via the
|
||||
// in-process instance-ai agent, and scores the result with the same
|
||||
// pairwise judge panel used by ai-workflow-builder.ee.
|
||||
//
|
||||
// Results are written to an output directory so a later step can build
|
||||
// a head-to-head comparison report against the ai-workflow-builder.ee
|
||||
// baseline.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-argument, @typescript-eslint/no-redundant-type-constituents, @typescript-eslint/no-base-to-string */
|
||||
// `SimpleWorkflow` is imported from `ai-workflow-builder.ee` via deep relative
|
||||
// paths; the `@/*` alias used inside that package collides with instance-ai's
|
||||
// own `@/*` mapping during transitive type-checking, so the type resolves to
|
||||
// `error` here. The `csvCell()` helper also calls `String(value)` on `unknown`
|
||||
// values by design.
|
||||
|
||||
import { ChatAnthropic } from '@langchain/anthropic';
|
||||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import { Client as LangSmithClient } from 'langsmith';
|
||||
import { promises as fs, readFileSync } from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import pLimit from 'p-limit';
|
||||
|
||||
import { loadRuns, renderDocument } from './report';
|
||||
import {
|
||||
createPairwiseEvaluator,
|
||||
type Feedback,
|
||||
type SimpleWorkflow,
|
||||
} from '../../../ai-workflow-builder.ee/evaluations/evaluators/pairwise';
|
||||
import { DEFAULTS } from '../../../ai-workflow-builder.ee/evaluations/support/constants';
|
||||
import type { Logger } from '../../src/logger';
|
||||
import { BuilderSandboxFactory } from '../../src/workspace/builder-sandbox-factory';
|
||||
import type { SandboxConfig } from '../../src/workspace/create-workspace';
|
||||
import { SnapshotManager } from '../../src/workspace/snapshot-manager';
|
||||
import {
|
||||
buildInProcess,
|
||||
type InProcessBuildResult,
|
||||
type ToolCallTrace,
|
||||
} from '../harness/in-process-builder';
|
||||
import { createLogger, type EvalLogger } from '../harness/logger';
|
||||
import { resolveSandboxConfig } from '../harness/sandbox-config';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// CLI args
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Parsed CLI options for the instance-ai pairwise eval run. */
interface PairwiseArgs {
	// LangSmith dataset name to evaluate against.
	dataset: string;
	// Number of LLM judges in the pairwise panel.
	judges: number;
	// How many times each example is attempted.
	iterations: number;
	// Maximum number of builds running concurrently.
	concurrency: number;
	// Optional cap on how many dataset examples are used.
	maxExamples?: number;
	// Optional allow-list of example ids, read from --example-ids-file.
	exampleIds?: Set<string>;
	// Per-build timeout in milliseconds.
	timeoutMs: number;
	// Directory run results are written to.
	outputDir: string;
	// Model id used by the judge panel.
	judgeModel: string;
	// Experiment name recorded with the run.
	experimentName: string;
	// When true, verbose (debug-level) logging is enabled.
	verbose: boolean;
}
|
||||
|
||||
function parseArgs(argv: string[]): PairwiseArgs {
|
||||
const get = (flag: string): string | undefined => {
|
||||
const idx = argv.indexOf(flag);
|
||||
if (idx === -1) return undefined;
|
||||
const value = argv[idx + 1];
|
||||
return value && !value.startsWith('--') ? value : undefined;
|
||||
};
|
||||
const has = (flag: string): boolean => argv.includes(flag);
|
||||
|
||||
const iso = new Date().toISOString().replace(/[:.]/g, '-');
|
||||
const defaultOutputDir = path.resolve(process.cwd(), '.output', 'pairwise', iso);
|
||||
|
||||
const exampleIdsFile = get('--example-ids-file');
|
||||
let exampleIds: Set<string> | undefined;
|
||||
if (exampleIdsFile) {
|
||||
const content = readFileSync(exampleIdsFile, 'utf8');
|
||||
const ids = content
|
||||
.split('\n')
|
||||
.map((s) => s.trim())
|
||||
.filter((s) => s.length > 0 && !s.startsWith('#'));
|
||||
exampleIds = new Set(ids);
|
||||
}
|
||||
|
||||
return {
|
||||
dataset: get('--dataset') ?? DEFAULTS.DATASET_NAME,
|
||||
judges: parsePositiveInt(get('--judges'), '--judges') ?? Number(DEFAULTS.NUM_JUDGES),
|
||||
iterations:
|
||||
parsePositiveInt(get('--iterations'), '--iterations') ?? Number(DEFAULTS.REPETITIONS),
|
||||
concurrency:
|
||||
parsePositiveInt(get('--concurrency'), '--concurrency') ?? Number(DEFAULTS.CONCURRENCY),
|
||||
maxExamples: parsePositiveInt(get('--max-examples'), '--max-examples'),
|
||||
exampleIds,
|
||||
timeoutMs:
|
||||
parsePositiveNumber(get('--timeout-ms'), '--timeout-ms') ?? Number(DEFAULTS.TIMEOUT_MS),
|
||||
outputDir: get('--output-dir') ?? defaultOutputDir,
|
||||
judgeModel: get('--judge-model') ?? 'claude-sonnet-4-5-20250929',
|
||||
experimentName: get('--experiment-name') ?? 'pairwise-evals-instance-ai',
|
||||
verbose: has('--verbose'),
|
||||
};
|
||||
}
|
||||
|
||||
function parsePositiveInt(raw: string | undefined, name: string): number | undefined {
|
||||
if (raw === undefined || raw === '') return undefined;
|
||||
const n = Number(raw);
|
||||
if (!Number.isFinite(n) || n <= 0 || !Number.isInteger(n)) {
|
||||
throw new Error(`${name} must be a positive integer, got "${raw}".`);
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
function parsePositiveNumber(raw: string | undefined, name: string): number | undefined {
|
||||
if (raw === undefined || raw === '') return undefined;
|
||||
const n = Number(raw);
|
||||
if (!Number.isFinite(n) || n <= 0) {
|
||||
throw new Error(`${name} must be a positive number, got "${raw}".`);
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Sandbox factory wiring
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function createSandboxFactory(
|
||||
config: SandboxConfig,
|
||||
evalLogger: EvalLogger,
|
||||
): BuilderSandboxFactory {
|
||||
if (!config.enabled) {
|
||||
throw new Error(
|
||||
'Sandbox config is unexpectedly disabled — eval runs always require a sandbox.',
|
||||
);
|
||||
}
|
||||
|
||||
const factoryLogger: Logger = {
|
||||
debug: (message, meta) => evalLogger.verbose(`[sandbox] ${message}${formatMeta(meta)}`),
|
||||
info: (message, meta) => evalLogger.verbose(`[sandbox] ${message}${formatMeta(meta)}`),
|
||||
warn: (message, meta) => evalLogger.warn(`[sandbox] ${message}${formatMeta(meta)}`),
|
||||
error: (message, meta) => evalLogger.error(`[sandbox] ${message}${formatMeta(meta)}`),
|
||||
};
|
||||
|
||||
const imageManager =
|
||||
config.provider === 'daytona'
|
||||
? new SnapshotManager(config.image, factoryLogger, undefined)
|
||||
: undefined;
|
||||
return new BuilderSandboxFactory(config, imageManager, factoryLogger);
|
||||
}
|
||||
|
||||
function formatMeta(meta: unknown): string {
|
||||
if (!meta || typeof meta !== 'object') return '';
|
||||
try {
|
||||
return ` ${JSON.stringify(meta)}`;
|
||||
} catch {
|
||||
return '';
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Dataset loading
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** One normalised LangSmith dataset example (see `loadExamples`). */
interface DatasetExample {
	/** LangSmith example id. */
	id: string;
	/** User prompt fed to the builder agent. */
	prompt: string;
	/** Optional "do" criteria text passed to the pairwise judges. */
	dos?: string;
	/** Optional "don't" criteria text passed to the pairwise judges. */
	donts?: string;
}
|
||||
|
||||
/**
 * Fetch every example of `args.dataset` from LangSmith and normalise it to
 * `DatasetExample`. Examples without a string `prompt` are skipped with a
 * warning; a layout-count diagnostic is logged at verbose level.
 */
async function loadExamples(args: PairwiseArgs, logger: EvalLogger): Promise<DatasetExample[]> {
	logger.info(`Fetching dataset "${args.dataset}" from LangSmith`);
	const lsClient = new LangSmithClient();
	const examples: DatasetExample[] = [];
	// Tally which criteria layout each example used, for the verbose log line.
	const layoutCounts = { evals: 0, context: 0, none: 0 };
	for await (const raw of lsClient.listExamples({ datasetName: args.dataset })) {
		const inputs = isRecord(raw.inputs) ? raw.inputs : {};
		// The notion-pairwise-workflows dataset stores criteria under
		// `inputs.evals.{dos,donts}`. Older fixtures used `inputs.context.*`
		// — read both paths so both layouts work.
		let criteria: Record<string, unknown> = {};
		if (isRecord(inputs.evals)) {
			criteria = inputs.evals;
			layoutCounts.evals++;
		} else if (isRecord(inputs.context)) {
			criteria = inputs.context;
			layoutCounts.context++;
		} else {
			layoutCounts.none++;
		}
		const example: DatasetExample = {
			id: raw.id,
			prompt: typeof inputs.prompt === 'string' ? inputs.prompt : '',
			dos: typeof criteria.dos === 'string' ? criteria.dos : undefined,
			donts: typeof criteria.donts === 'string' ? criteria.donts : undefined,
		};
		if (!example.prompt) {
			logger.warn(`Skipping example ${raw.id}: no prompt field`);
			continue;
		}
		examples.push(example);
	}
	logger.verbose(
		`Dataset criteria layout: evals=${layoutCounts.evals} context=${layoutCounts.context} none=${layoutCounts.none}`,
	);
	return examples;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per-example runner
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Everything recorded for one (example, iteration) run — serialised as one
 * line in results.jsonl by `writeOutputs`.
 */
interface ExampleRecord {
	exampleId: string;
	/** 1-based iteration index within this example. */
	iteration: number;
	prompt: string;
	dos?: string;
	donts?: string;
	/** null when the build produced no workflow. */
	workflow: SimpleWorkflow | null;
	build: {
		success: boolean;
		errorClass?: string;
		errorMessage?: string;
		durationMs: number;
		extraWorkflowCount: number;
		interactivity: InProcessBuildResult['interactivity'];
	};
	toolCalls: ToolCallTrace[];
	/** Judge scores; empty when the build failed or the judge panel threw. */
	feedback: Feedback[];
}
|
||||
|
||||
/**
|
||||
* Eval-only suffix appended to every dataset prompt. Pushes the agent past
|
||||
* its production "ask before assuming / set up credentials first" instinct
|
||||
* — there is no human in the loop, so a clarification turn is a guaranteed
|
||||
* `no_workflow_built`. Lives in the harness, not the production builder
|
||||
* prompt, so production behavior is unaffected.
|
||||
*
|
||||
* Strictly describes the eval environment and the required terminal action
|
||||
* (call `submit-workflow`). Does not name SDK helpers or otherwise lead the
|
||||
* agent toward specific implementation choices — those are what the eval
|
||||
* measures.
|
||||
*/
|
||||
const EVAL_PROMPT_SUFFIX =
|
||||
'\n\n---\n' +
|
||||
'You are running inside an automated, non-interactive evaluation. ' +
|
||||
'There is no human to answer follow-up questions. ' +
|
||||
'Do not call `ask-user` and do not ask for clarification — pick reasonable defaults and proceed.';
|
||||
|
||||
/**
 * Build one workflow for `example` (iteration `iteration`) and, when a
 * workflow was produced, score it with the pairwise judge panel.
 *
 * Always returns a record: build failures are logged and returned with empty
 * feedback, and judge-panel errors are logged without discarding the build.
 */
async function runExample(
	example: DatasetExample,
	iteration: number,
	judgeLlm: BaseChatModel,
	args: PairwiseArgs,
	logger: EvalLogger,
	sandboxFactory: BuilderSandboxFactory,
): Promise<ExampleRecord> {
	logger.verbose(`[${example.id} #${iteration}] building workflow...`);
	// Raw agent chunk stream for this run, kept under <outputDir>/chunks/ for debugging.
	const logPath = path.join(
		args.outputDir,
		'chunks',
		`${safeFilename(`${example.id}_${iteration}`)}.jsonl`,
	);
	const build = await buildInProcess({
		prompt: example.prompt + EVAL_PROMPT_SUFFIX,
		timeoutMs: args.timeoutMs,
		logPath,
		sandboxFactory,
	});

	const record: ExampleRecord = {
		exampleId: example.id,
		iteration,
		prompt: example.prompt,
		dos: example.dos,
		donts: example.donts,
		workflow: build.workflow ?? null,
		build: {
			success: build.success,
			errorClass: build.errorClass,
			errorMessage: build.errorMessage,
			durationMs: build.durationMs,
			extraWorkflowCount: build.extraWorkflows.length,
			interactivity: build.interactivity,
		},
		toolCalls: build.toolCalls,
		feedback: [],
	};

	// No workflow means there is nothing to judge; feedback stays empty.
	if (!build.workflow) {
		logger.warn(
			`[${example.id} #${iteration}] build failed (${build.errorClass ?? 'unknown'}): ${build.errorMessage ?? 'no details'}`,
		);
		return record;
	}

	try {
		const evaluator = createPairwiseEvaluator(judgeLlm, { numJudges: args.judges });
		const feedback = await evaluator.evaluate(build.workflow, {
			prompt: example.prompt,
			dos: example.dos,
			donts: example.donts,
		});
		record.feedback = feedback;
		const primary = feedback.find((f) => f.metric === 'pairwise_primary');
		logger.info(
			`[${example.id} #${iteration}] pairwise_primary=${primary?.score ?? 'n/a'} duration=${build.durationMs}ms`,
		);
	} catch (error) {
		// Judge/tooling failure: keep the build record, drop the scores.
		logger.error(
			`[${example.id} #${iteration}] judge panel failed: ${error instanceof Error ? error.message : String(error)}`,
		);
	}

	return record;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Output writing
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Aggregate metrics for one run — written to summary.json by `writeOutputs`. */
interface Summary {
	builder: 'instance-ai';
	dataset: string;
	judgeModel: string;
	numJudges: number;
	iterations: number;
	experimentName: string;
	/** ISO timestamps. */
	startedAt: string;
	finishedAt: string;
	totals: {
		/** Distinct example ids. */
		examples: number;
		/** Total (example, iteration) runs. */
		runs: number;
		buildSuccess: number;
		/** Failure counts keyed by errorClass. */
		buildFailures: Record<string, number>;
		/** Mean pairwise_primary; build failures count as 0 (see writeOutputs). */
		primaryPassRate: number;
		avgDiagnostic: number;
	};
	interactivity: {
		askUserCount: number;
		planToolCount: number;
		autoApprovedSuspensions: number;
		mockedCredentialTypes: string[];
	};
	sandbox: { provider: string };
}
|
||||
|
||||
/**
 * Write all run artifacts under `outputDir`: results.jsonl (one record per
 * line), per-workflow JSON files, results.csv, and summary.json. Returns the
 * computed summary. Safe to call repeatedly for incremental flushes.
 *
 * @param silent Suppress the final success log line (used by incremental flushes).
 */
async function writeOutputs(
	outputDir: string,
	records: ExampleRecord[],
	args: PairwiseArgs,
	startedAt: Date,
	finishedAt: Date,
	logger: EvalLogger,
	sandboxProvider: string,
	silent = false,
): Promise<Summary> {
	await fs.mkdir(outputDir, { recursive: true });
	await fs.mkdir(path.join(outputDir, 'workflows'), { recursive: true });

	// results.jsonl + per-workflow files. Workflow JSON is immutable per
	// (exampleId, iteration), so skip any file already on disk to avoid
	// O(N²) rewrites across incremental flushes.
	const jsonlPath = path.join(outputDir, 'results.jsonl');
	const lines: string[] = [];
	for (const record of records) {
		lines.push(JSON.stringify(record));
		if (record.workflow) {
			const slug = safeFilename(`${record.exampleId}_${record.iteration}`);
			const workflowPath = path.join(outputDir, 'workflows', `${slug}.json`);
			if (!(await fileExists(workflowPath))) {
				await fs.writeFile(workflowPath, JSON.stringify(record.workflow, null, 2), 'utf8');
			}
		}
	}
	await fs.writeFile(jsonlPath, lines.join('\n') + '\n', 'utf8');

	// results.csv — flat metric columns for spreadsheet import
	const csvPath = path.join(outputDir, 'results.csv');
	const csvHeader = [
		'exampleId',
		'iteration',
		'buildSuccess',
		'buildError',
		'durationMs',
		'askUserCount',
		'planToolCount',
		'pairwisePrimary',
		'pairwiseDiagnostic',
		'pairwiseJudgesPassed',
	].join(',');
	const csvRows = records.map((r) => {
		// Missing metrics render as empty CSV cells.
		const find = (m: string) => r.feedback.find((f) => f.metric === m)?.score ?? '';
		return [
			r.exampleId,
			r.iteration,
			r.build.success ? 1 : 0,
			r.build.errorClass ?? '',
			r.build.durationMs,
			r.build.interactivity.askUserCount,
			r.build.interactivity.planToolCount,
			find('pairwise_primary'),
			find('pairwise_diagnostic'),
			find('pairwise_judges_passed'),
		]
			.map(csvCell)
			.join(',');
	});
	await fs.writeFile(csvPath, [csvHeader, ...csvRows].join('\n') + '\n', 'utf8');

	// summary.json — aggregate the per-record data into one Summary.
	const buildFailures: Record<string, number> = {};
	let buildSuccess = 0;
	let primaryPassSum = 0;
	let primaryPassCount = 0;
	let diagnosticSum = 0;
	let diagnosticCount = 0;
	const allMockedCreds = new Set<string>();
	let askUserCount = 0;
	let planToolCount = 0;
	let autoApprovedSuspensions = 0;

	for (const record of records) {
		if (record.build.success) buildSuccess++;
		if (record.build.errorClass) {
			buildFailures[record.build.errorClass] = (buildFailures[record.build.errorClass] ?? 0) + 1;
		}
		askUserCount += record.build.interactivity.askUserCount;
		planToolCount += record.build.interactivity.planToolCount;
		autoApprovedSuspensions += record.build.interactivity.autoApprovedSuspensions;
		for (const type of record.build.interactivity.mockedCredentialTypes) {
			allMockedCreds.add(type);
		}

		const primary = record.feedback.find((f) => f.metric === 'pairwise_primary')?.score;
		if (typeof primary === 'number') {
			primaryPassSum += primary;
			primaryPassCount++;
		} else if (!record.build.success) {
			// A build failure means the agent had its chance and produced no
			// workflow — that's a failed attempt at the pairwise criteria, not
			// a measurement gap. Count it as 0 so the pass rate isn't inflated
			// by silently dropping failures from the denominator. Judge errors
			// (build succeeded but the panel threw) are still excluded — those
			// are tooling problems, not builder problems.
			primaryPassCount++;
		}
		const diag = record.feedback.find((f) => f.metric === 'pairwise_diagnostic')?.score;
		if (typeof diag === 'number') {
			diagnosticSum += diag;
			diagnosticCount++;
		}
	}

	const summary: Summary = {
		builder: 'instance-ai',
		dataset: args.dataset,
		judgeModel: args.judgeModel,
		numJudges: args.judges,
		iterations: args.iterations,
		experimentName: args.experimentName,
		startedAt: startedAt.toISOString(),
		finishedAt: finishedAt.toISOString(),
		totals: {
			examples: new Set(records.map((r) => r.exampleId)).size,
			runs: records.length,
			buildSuccess,
			buildFailures,
			primaryPassRate: primaryPassCount ? primaryPassSum / primaryPassCount : 0,
			avgDiagnostic: diagnosticCount ? diagnosticSum / diagnosticCount : 0,
		},
		interactivity: {
			askUserCount,
			planToolCount,
			autoApprovedSuspensions,
			mockedCredentialTypes: Array.from(allMockedCreds),
		},
		sandbox: { provider: sandboxProvider },
	};
	await fs.writeFile(
		path.join(outputDir, 'summary.json'),
		JSON.stringify(summary, null, 2),
		'utf8',
	);
	if (!silent) {
		logger.success(`Wrote ${records.length} results to ${outputDir}`);
	}
	return summary;
}
|
||||
|
||||
/**
|
||||
* Regenerate the cross-run HTML report from `<reportRoot>/*` so the page
|
||||
* stays current as examples complete. Best-effort: a render failure is
|
||||
* logged but does not abort the run.
|
||||
*/
|
||||
async function regenerateReport(
|
||||
reportRoot: string,
|
||||
reportFile: string,
|
||||
logger: EvalLogger,
|
||||
): Promise<void> {
|
||||
try {
|
||||
const runs = await loadRuns(reportRoot);
|
||||
if (runs.length === 0) return;
|
||||
const html = renderDocument(runs);
|
||||
await fs.writeFile(reportFile, html, 'utf8');
|
||||
logger.verbose(`Regenerated report (${runs.length} run(s)) at ${reportFile}`);
|
||||
} catch (error) {
|
||||
logger.warn(
|
||||
`Report regeneration failed: ${error instanceof Error ? error.message : String(error)}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Main
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * CLI entry point: parse args, wire the sandbox and judge LLM, run every
 * selected example × iteration with bounded concurrency, and write outputs
 * incrementally so partially completed runs remain inspectable.
 */
async function main(): Promise<void> {
	const args = parseArgs(process.argv.slice(2));
	const logger = createLogger(args.verbose);
	logger.info(
		`pairwise eval: dataset=${args.dataset} judges=${args.judges} iterations=${args.iterations}`,
	);

	// One key serves both the builder agent and the judge LLM.
	const apiKey = process.env.N8N_AI_ANTHROPIC_KEY ?? process.env.ANTHROPIC_API_KEY;
	if (!apiKey) {
		throw new Error(
			'Set N8N_AI_ANTHROPIC_KEY or ANTHROPIC_API_KEY — both the builder agent and the judge LLM need it.',
		);
	}

	const sandboxConfig = resolveSandboxConfig(process.env);
	const sandboxFactory = createSandboxFactory(sandboxConfig, logger);
	if (!sandboxConfig.enabled) {
		throw new Error('resolveSandboxConfig returned a disabled config — this should never happen.');
	}
	logger.info(
		`Sandbox: provider=${sandboxConfig.provider} (workflow built via TypeScript file + tsc)`,
	);

	const judgeLlm = new ChatAnthropic({
		model: args.judgeModel,
		apiKey,
		temperature: 0,
		maxTokens: 8192,
	});

	const examples = await loadExamples(args, logger);
	// Optional filtering by an explicit id list, then by --max-examples.
	let filtered = examples;
	if (args.exampleIds) {
		const ids = args.exampleIds;
		filtered = examples.filter((e) => ids.has(e.id));
		const missing = Array.from(ids).filter((id) => !examples.some((e) => e.id === id));
		logger.info(
			`Filtered to ${filtered.length} examples by --example-ids-file (${ids.size} requested${missing.length ? `, ${missing.length} not found` : ''})`,
		);
		if (missing.length) {
			logger.warn(
				`Missing IDs: ${missing.slice(0, 5).join(', ')}${missing.length > 5 ? ', ...' : ''}`,
			);
		}
	}
	const selected = args.maxExamples !== undefined ? filtered.slice(0, args.maxExamples) : filtered;
	logger.info(`Running ${selected.length} examples x ${args.iterations} iterations`);

	const limit = pLimit(args.concurrency);
	const records: ExampleRecord[] = [];
	const startedAt = new Date();
	const reportRoot = path.dirname(args.outputDir);
	const reportFile = path.join(reportRoot, 'report.html');

	// Serialize incremental writes so concurrent example completions don't
	// race on the same output files.
	let writeQueue: Promise<unknown> = Promise.resolve();
	const flushIncremental = async (): Promise<unknown> => {
		writeQueue = writeQueue.then(async () => {
			// Sort a snapshot so output ordering is stable regardless of completion order.
			const snapshot = [...records].sort((a, b) =>
				a.exampleId === b.exampleId
					? a.iteration - b.iteration
					: a.exampleId.localeCompare(b.exampleId),
			);
			await writeOutputs(
				args.outputDir,
				snapshot,
				args,
				startedAt,
				new Date(),
				logger,
				sandboxConfig.provider,
				true,
			);
			await regenerateReport(reportRoot, reportFile, logger);
		});
		return await writeQueue;
	};

	const work: Array<Promise<void>> = [];
	for (const example of selected) {
		for (let i = 1; i <= args.iterations; i++) {
			work.push(
				limit(async () => {
					const record = await runExample(example, i, judgeLlm, args, logger, sandboxFactory);
					records.push(record);
					await flushIncremental();
				}),
			);
		}
	}
	await Promise.all(work);
	// Drain any flush still queued after the last worker finished.
	await writeQueue;

	const finishedAt = new Date();
	records.sort((a, b) =>
		a.exampleId === b.exampleId
			? a.iteration - b.iteration
			: a.exampleId.localeCompare(b.exampleId),
	);
	await writeOutputs(
		args.outputDir,
		records,
		args,
		startedAt,
		finishedAt,
		logger,
		sandboxConfig.provider,
	);
	await regenerateReport(reportRoot, reportFile, logger);
	logger.info(`Report: ${reportFile}`);
	logger.info(
		`Note: LangSmith feedback upload is not yet wired up — scores are in ${args.outputDir}. ` +
			'Run scripts/upload-pairwise-to-langsmith.ts against summary.json to push results.',
	);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function isRecord(value: unknown): value is Record<string, unknown> {
|
||||
return value !== null && typeof value === 'object' && !Array.isArray(value);
|
||||
}
|
||||
|
||||
function safeFilename(s: string): string {
|
||||
return s.replace(/[^a-zA-Z0-9._-]+/g, '_').slice(0, 120);
|
||||
}
|
||||
|
||||
async function fileExists(filePath: string): Promise<boolean> {
|
||||
try {
|
||||
await fs.access(filePath);
|
||||
return true;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
function csvCell(value: unknown): string {
|
||||
if (value === null || value === undefined) return '';
|
||||
const str = String(value);
|
||||
if (str.includes(',') || str.includes('"') || str.includes('\n') || str.includes('\r')) {
|
||||
return '"' + str.replace(/"/g, '""') + '"';
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Entry point
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
if (require.main === module) {
|
||||
main().catch((error) => {
|
||||
console.error(error instanceof Error ? (error.stack ?? error.message) : String(error));
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
627
packages/@n8n/instance-ai/evaluations/cli/report.ts
Normal file
627
packages/@n8n/instance-ai/evaluations/cli/report.ts
Normal file
|
|
@ -0,0 +1,627 @@
|
|||
// ---------------------------------------------------------------------------
|
||||
// Generate an HTML report from all saved pairwise eval runs.
|
||||
//
|
||||
// Walks `<output-root>/pairwise/*` (default `.output/pairwise/`), reads
|
||||
// every run's `summary.json` + `results.jsonl`, and produces one HTML
|
||||
// file with a run picker and per-example details. Each built workflow is
|
||||
// embedded as an `<n8n-demo>` web component so reviewers can poke at the
|
||||
// canvas inline.
|
||||
//
|
||||
// https://github.com/n8n-io/n8n-demo-webcomponent
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
import { jsonParse } from 'n8n-workflow';
|
||||
import { promises as fs } from 'node:fs';
|
||||
import path from 'node:path';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Shape of a run's summary.json as written by the pairwise eval CLI. */
interface SummaryJson {
	builder: string;
	dataset: string;
	judgeModel: string;
	numJudges: number;
	iterations: number;
	experimentName: string;
	/** ISO timestamps. */
	startedAt: string;
	finishedAt: string;
	totals: {
		examples: number;
		runs: number;
		buildSuccess: number;
		/** Failure counts keyed by error class. */
		buildFailures: Record<string, number>;
		primaryPassRate: number;
		avgDiagnostic: number;
	};
	interactivity: {
		askUserCount: number;
		planToolCount: number;
		autoApprovedSuspensions: number;
		mockedCredentialTypes: string[];
	};
}
|
||||
|
||||
/** One evaluator score as stored per record in results.jsonl. */
interface FeedbackEntry {
	evaluator: string;
	/** Metric name, e.g. 'pairwise_primary' or 'judge0'. */
	metric: string;
	score: number;
	kind?: string;
	/** Free-text judge notes, rendered in the judges table. */
	comment?: string;
}
|
||||
|
||||
/** Suspension details captured on a tool call during the build. */
interface ToolCallSuspension {
	message?: string;
	/** Questions raised during the suspension; rendered as JSON in the report. */
	questions?: unknown;
	severity?: string;
	/** True when the eval harness approved the suspension automatically. */
	autoApproved: boolean;
}
|
||||
|
||||
/** One tool call in the builder's execution trace. */
interface ToolCallTrace {
	/** Sequence number within the run. */
	step: number;
	toolCallId: string;
	toolName: string;
	args?: unknown;
	/** Result payload; undefined with no `error` renders as "pending". */
	result?: unknown;
	error?: string;
	elapsedMs?: number;
	suspension?: ToolCallSuspension;
}
|
||||
|
||||
/** One parsed line of a run's results.jsonl. */
interface ResultRecord {
	exampleId: string;
	iteration: number;
	prompt: string;
	dos?: string;
	donts?: string;
	/** The built workflow JSON; falsy when no workflow was produced. */
	workflow: unknown;
	build: {
		success: boolean;
		errorClass?: string;
		errorMessage?: string;
		durationMs: number;
		extraWorkflowCount: number;
		interactivity: {
			askUserCount: number;
			planToolCount: number;
			autoApprovedSuspensions: number;
			mockedCredentialTypes: string[];
		};
	};
	/** Optional — older runs predate the field. */
	toolCalls?: ToolCallTrace[];
	feedback: FeedbackEntry[];
}
|
||||
|
||||
/** One discovered run directory: its name, parsed summary, and all records. */
interface Run {
	dirName: string;
	summary: SummaryJson;
	results: ResultRecord[];
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Discovery
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Discover every completed run directory under `rootDir` and load its
 * summary.json + results.jsonl. Directories missing either file are skipped
 * (incomplete/aborted runs); runs are returned newest-first by startedAt.
 */
export async function loadRuns(rootDir: string): Promise<Run[]> {
	const entries = await fs.readdir(rootDir, { withFileTypes: true });
	const runs: Run[] = [];
	for (const entry of entries) {
		if (!entry.isDirectory()) continue;
		const dir = path.join(rootDir, entry.name);
		const summaryPath = path.join(dir, 'summary.json');
		const resultsPath = path.join(dir, 'results.jsonl');
		let summaryRaw: string;
		let resultsRaw: string;
		try {
			[summaryRaw, resultsRaw] = await Promise.all([
				fs.readFile(summaryPath, 'utf8'),
				fs.readFile(resultsPath, 'utf8'),
			]);
		} catch (error) {
			// Incomplete/aborted runs lack one of the two files — skip those
			// silently. Any other read failure (permissions, I/O) should surface.
			if (isMissingFileError(error)) continue;
			throw error;
		}
		const summary = jsonParse<SummaryJson>(summaryRaw, {
			errorMessage: `Failed to parse ${summaryPath}`,
		});
		// results.jsonl is one JSON record per non-blank line.
		const results = resultsRaw
			.split('\n')
			.filter((line) => line.trim().length > 0)
			.map((line) =>
				jsonParse<ResultRecord>(line, {
					errorMessage: `Failed to parse a line in ${resultsPath}`,
				}),
			);
		runs.push({ dirName: entry.name, summary, results });
	}
	runs.sort((a, b) => b.summary.startedAt.localeCompare(a.summary.startedAt));
	return runs;
}
|
||||
|
||||
function isMissingFileError(error: unknown): boolean {
|
||||
return (
|
||||
typeof error === 'object' &&
|
||||
error !== null &&
|
||||
'code' in error &&
|
||||
(error as { code: unknown }).code === 'ENOENT'
|
||||
);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Rendering
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function escapeHtml(input: string): string {
|
||||
return input
|
||||
.replace(/&/g, '&')
|
||||
.replace(/</g, '<')
|
||||
.replace(/>/g, '>')
|
||||
.replace(/"/g, '"')
|
||||
.replace(/'/g, ''');
|
||||
}
|
||||
|
||||
function escapeAttr(input: string): string {
|
||||
return input.replace(/&/g, '&').replace(/'/g, ''').replace(/"/g, '"');
|
||||
}
|
||||
|
||||
function findScore(feedback: FeedbackEntry[], metric: string): number | undefined {
|
||||
return feedback.find((f) => f.metric === metric)?.score;
|
||||
}
|
||||
|
||||
function renderCriteriaList(raw: string | undefined, kind: 'do' | 'dont'): string {
|
||||
if (!raw) return '';
|
||||
const lines = raw
|
||||
.split('\n')
|
||||
.map((line) => line.trim())
|
||||
.filter((line) => line.length > 0);
|
||||
if (lines.length === 0) return '';
|
||||
const items = lines.map((line) => `<li>${escapeHtml(line)}</li>`).join('');
|
||||
const label = kind === 'do' ? 'Do' : "Don't";
|
||||
return `<div class="criteria ${kind}"><h4>${label}</h4><ul>${items}</ul></div>`;
|
||||
}
|
||||
|
||||
/** Render the compact score badges shown in an example's summary row. */
function renderFeedbackBadges(feedback: FeedbackEntry[]): string {
	const primary = findScore(feedback, 'pairwise_primary');
	const diagnostic = findScore(feedback, 'pairwise_diagnostic');
	const judgesPassed = findScore(feedback, 'pairwise_judges_passed');
	const totalPasses = findScore(feedback, 'pairwise_total_passes');
	const totalViolations = findScore(feedback, 'pairwise_total_violations');

	const badges: string[] = [];
	if (primary !== undefined) {
		// A primary score of exactly 1 gets pass styling; anything else fails.
		const cls = primary === 1 ? 'badge-pass' : 'badge-fail';
		badges.push(`<span class="badge ${cls}">primary ${primary}</span>`);
	}
	if (diagnostic !== undefined) {
		badges.push(`<span class="badge badge-neutral">diagnostic ${diagnostic.toFixed(2)}</span>`);
	}
	if (judgesPassed !== undefined) {
		badges.push(`<span class="badge badge-neutral">${judgesPassed} judges pass</span>`);
	}
	// Pass/violation totals are only meaningful as a pair.
	if (totalPasses !== undefined && totalViolations !== undefined) {
		badges.push(
			`<span class="badge badge-neutral">${totalPasses} passes / ${totalViolations} violations</span>`,
		);
	}
	return badges.join('');
}
|
||||
|
||||
/**
 * Render the per-judge pass/fail table from feedback entries whose metric
 * matches `judge<N>`; returns '' when there are none.
 */
function renderJudgeComments(feedback: FeedbackEntry[]): string {
	const judges = feedback.filter((f) => /^judge\d+$/.test(f.metric));
	if (judges.length === 0) return '';
	const rows = judges
		.map((j) => {
			const cls = j.score === 1 ? 'judge-pass' : 'judge-fail';
			const comment = j.comment ? escapeHtml(j.comment) : '<em>no violations</em>';
			return `<tr><td class="${cls}">${escapeHtml(j.metric)}</td><td>${j.score}</td><td>${comment}</td></tr>`;
		})
		.join('');
	return `<table class="judges"><thead><tr><th>Judge</th><th>Pass</th><th>Notes</th></tr></thead><tbody>${rows}</tbody></table>`;
}
|
||||
|
||||
function formatJson(value: unknown): string {
|
||||
if (value === undefined) return '';
|
||||
if (typeof value === 'string') return value;
|
||||
try {
|
||||
return JSON.stringify(value, null, 2);
|
||||
} catch {
|
||||
// Fallback when `value` is non-serialisable (e.g. has a circular ref).
|
||||
// `String(value)` may produce '[object Object]' but it's the only way
|
||||
// to surface *something* in the report instead of throwing.
|
||||
// eslint-disable-next-line @typescript-eslint/no-base-to-string
|
||||
return String(value);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Render the ordered tool-call timeline for one run. Each call shows its
 * step, name, elapsed time, state badges (ok/error/pending, plus suspension
 * state), and collapsible input/output/error/suspension blocks.
 */
function renderToolCallTimeline(toolCalls: ToolCallTrace[] | undefined): string {
	if (!toolCalls || toolCalls.length === 0) {
		return '<div class="no-tools">No tool calls recorded.</div>';
	}
	const items = toolCalls
		.map((trace) => {
			const elapsed =
				typeof trace.elapsedMs === 'number' ? `${trace.elapsedMs}ms` : '<em>pending</em>';
			const stateBits: string[] = [];
			// State precedence: error beats result; neither means still pending.
			if (trace.error) stateBits.push('<span class="tool-state error">error</span>');
			else if (trace.result !== undefined) stateBits.push('<span class="tool-state ok">ok</span>');
			else stateBits.push('<span class="tool-state pending">pending</span>');
			if (trace.suspension) {
				stateBits.push(
					trace.suspension.autoApproved
						? '<span class="tool-state suspended auto">auto-approved</span>'
						: '<span class="tool-state suspended">suspended</span>',
				);
			}

			const blocks: string[] = [];
			if (trace.suspension) {
				const suspParts: string[] = [];
				if (trace.suspension.message) {
					suspParts.push(`<div class="tool-message">${escapeHtml(trace.suspension.message)}</div>`);
				}
				if (trace.suspension.questions) {
					suspParts.push(
						`<details class="tool-block tool-questions"><summary>Questions asked</summary><pre>${escapeHtml(formatJson(trace.suspension.questions))}</pre></details>`,
					);
				}
				blocks.push(`<div class="tool-suspension">${suspParts.join('')}</div>`);
			}
			if (trace.args !== undefined) {
				blocks.push(
					`<details class="tool-block tool-args"><summary>Input</summary><pre>${escapeHtml(formatJson(trace.args))}</pre></details>`,
				);
			}
			// Errors render open by default; successful results stay collapsed.
			if (trace.error) {
				blocks.push(
					`<details class="tool-block tool-error" open><summary>Error</summary><pre>${escapeHtml(trace.error)}</pre></details>`,
				);
			} else if (trace.result !== undefined) {
				blocks.push(
					`<details class="tool-block tool-result"><summary>Output</summary><pre>${escapeHtml(formatJson(trace.result))}</pre></details>`,
				);
			}

			return `<li class="tool-call">
	<header class="tool-call-header">
		<span class="tool-step">#${trace.step}</span>
		<span class="tool-name">${escapeHtml(trace.toolName)}</span>
		<span class="tool-elapsed">${elapsed}</span>
		<span class="tool-states">${stateBits.join('')}</span>
	</header>
	${blocks.join('')}
</li>`;
		})
		.join('');
	return `<ol class="tool-calls">${items}</ol>`;
}
|
||||
|
||||
/**
 * Render a built workflow as a lazily-mounted canvas placeholder, or a
 * "no workflow" notice when nothing was built.
 */
function renderWorkflow(workflow: unknown): string {
	if (!workflow) {
		return '<div class="no-workflow">No workflow built.</div>';
	}
	const json = JSON.stringify(workflow);
	// Lazy mount: store the workflow on a placeholder and let the inline
	// script inject the <n8n-demo> element when the parent <details> is
	// expanded. Rendering all 77 demos upfront kills first-paint performance.
	return `<div class="workflow-mount" data-workflow="${escapeAttr(json)}"></div>`;
}
|
||||
|
||||
/**
 * Render one (example, iteration) record as a collapsible `<details>` card:
 * status banner, prompt, do/don't criteria, build error (if any),
 * interactivity notes, the workflow canvas, tool-call timeline, and the
 * judge table. `idPrefix` namespaces the element id per run.
 */
function renderExample(record: ResultRecord, idPrefix: string): string {
	const primary = findScore(record.feedback, 'pairwise_primary');
	// Three visual states: built+primary=1 → pass, built otherwise → partial,
	// not built → fail.
	const statusCls =
		record.build.success && primary === 1
			? 'ex-pass'
			: record.build.success
				? 'ex-partial'
				: 'ex-fail';
	const statusLabel = !record.build.success
		? `BUILD ${record.build.errorClass ?? 'FAILED'}`
		: primary === 1
			? 'PASS'
			: 'FAIL';
	const exampleId = `${idPrefix}-${record.exampleId}-${record.iteration}`;
	const interact = record.build.interactivity;
	const interactBits: string[] = [];
	if (interact.askUserCount > 0) interactBits.push(`ask-user ×${interact.askUserCount}`);
	if (interact.planToolCount > 0) interactBits.push(`plan ×${interact.planToolCount}`);
	if (interact.autoApprovedSuspensions > 0)
		interactBits.push(`suspend ×${interact.autoApprovedSuspensions}`);
	if (interact.mockedCredentialTypes.length > 0)
		interactBits.push(`mocked creds: ${interact.mockedCredentialTypes.join(', ')}`);

	const errorBlock = record.build.errorMessage
		? `<div class="error">${escapeHtml(record.build.errorMessage)}</div>`
		: '';

	// One-line whitespace-collapsed preview for the summary row.
	const promptPreview = record.prompt.replace(/\s+/g, ' ').trim();

	return `
<details class="example ${statusCls}" id="${escapeAttr(exampleId)}">
	<summary>
		<span class="status">${statusLabel}</span>
		<div class="summary-text">
			<span class="prompt-preview" title="${escapeAttr(promptPreview)}">${escapeHtml(promptPreview)}</span>
			<span class="example-id">${escapeHtml(record.exampleId)}</span>
		</div>
		<span class="iteration">#${record.iteration}</span>
		<span class="duration">${record.build.durationMs}ms</span>
		<span class="badges">${renderFeedbackBadges(record.feedback)}</span>
	</summary>
	<div class="body">
		<section class="prompt">
			<h3>Prompt</h3>
			<pre>${escapeHtml(record.prompt)}</pre>
		</section>
		<section class="criteria-row">
			${renderCriteriaList(record.dos, 'do')}
			${renderCriteriaList(record.donts, 'dont')}
		</section>
		${errorBlock}
		${interactBits.length > 0 ? `<div class="interactivity">${interactBits.map(escapeHtml).join(' · ')}</div>` : ''}
		<section class="workflow-section">
			<h3>Built workflow</h3>
			${renderWorkflow(record.workflow)}
		</section>
		<details class="tool-calls-section">
			<summary><h3>Tool calls${record.toolCalls && record.toolCalls.length > 0 ? ` (${record.toolCalls.length})` : ''}</h3></summary>
			${renderToolCallTimeline(record.toolCalls)}
		</details>
		${renderJudgeComments(record.feedback)}
	</div>
</details>`;
}
|
||||
|
||||
function renderRun(run: Run, index: number): string {
|
||||
const s = run.summary;
|
||||
const pct = (n: number): string => `${(n * 100).toFixed(1)}%`;
|
||||
const totalFailures = Object.values(s.totals.buildFailures).reduce((a, b) => a + b, 0);
|
||||
const failureDetail = Object.entries(s.totals.buildFailures)
|
||||
.map(([k, v]) => `${k}: ${v}`)
|
||||
.join(', ');
|
||||
|
||||
const examples = run.results
|
||||
.sort((a, b) =>
|
||||
a.exampleId === b.exampleId
|
||||
? a.iteration - b.iteration
|
||||
: a.exampleId.localeCompare(b.exampleId),
|
||||
)
|
||||
.map((r) => renderExample(r, `run-${index}`))
|
||||
.join('\n');
|
||||
|
||||
return `
|
||||
<section class="run" id="run-${index}">
|
||||
<header class="run-header">
|
||||
<h2>${escapeHtml(s.experimentName)}</h2>
|
||||
<div class="run-meta">
|
||||
<span><strong>Builder:</strong> ${escapeHtml(s.builder)}</span>
|
||||
<span><strong>Dataset:</strong> ${escapeHtml(s.dataset)}</span>
|
||||
<span><strong>Judges:</strong> ${s.numJudges}</span>
|
||||
<span><strong>Judge model:</strong> ${escapeHtml(s.judgeModel)}</span>
|
||||
<span><strong>Iterations:</strong> ${s.iterations}</span>
|
||||
<span><strong>Started:</strong> ${escapeHtml(s.startedAt)}</span>
|
||||
<span><strong>Dir:</strong> <code>${escapeHtml(run.dirName)}</code></span>
|
||||
</div>
|
||||
<div class="run-totals">
|
||||
<span class="total"><strong>Examples:</strong> ${s.totals.examples}</span>
|
||||
<span class="total"><strong>Runs:</strong> ${s.totals.runs}</span>
|
||||
<span class="total success"><strong>Build ok:</strong> ${s.totals.buildSuccess}</span>
|
||||
<span class="total ${totalFailures > 0 ? 'fail' : ''}"><strong>Build fail:</strong> ${totalFailures}${failureDetail ? ` (${escapeHtml(failureDetail)})` : ''}</span>
|
||||
<span class="total"><strong>Primary pass rate:</strong> ${pct(s.totals.primaryPassRate)}</span>
|
||||
<span class="total"><strong>Avg diagnostic:</strong> ${s.totals.avgDiagnostic.toFixed(2)}</span>
|
||||
</div>
|
||||
${
|
||||
s.interactivity.askUserCount > 0 ||
|
||||
s.interactivity.planToolCount > 0 ||
|
||||
s.interactivity.autoApprovedSuspensions > 0 ||
|
||||
s.interactivity.mockedCredentialTypes.length > 0
|
||||
? `<div class="run-interactivity">
|
||||
<strong>Interactivity:</strong>
|
||||
ask-user ×${s.interactivity.askUserCount} ·
|
||||
plan ×${s.interactivity.planToolCount} ·
|
||||
suspend ×${s.interactivity.autoApprovedSuspensions} ·
|
||||
mocked creds: ${s.interactivity.mockedCredentialTypes.map(escapeHtml).join(', ') || 'none'}
|
||||
</div>`
|
||||
: ''
|
||||
}
|
||||
</header>
|
||||
<div class="examples">${examples}</div>
|
||||
</section>`;
|
||||
}
|
||||
|
||||
/**
 * Assemble the full self-contained HTML report for one or more runs.
 *
 * The document loads the `<n8n-demo>` web component (plus its web-components
 * and lit polyfill loaders) from public CDNs, inlines all styling, and ends
 * with a small script that lazily mounts workflow demos when an example's
 * `<details>` card is opened (see `renderWorkflow` / `.workflow-mount`).
 *
 * @param runs - Runs to render, one `<section class="run">` each.
 * @returns Complete HTML document as a string.
 */
export function renderDocument(runs: Run[]): string {
	// Render each run in order; the index namespaces per-run DOM ids.
	const body = runs.map((run, i) => renderRun(run, i)).join('\n');

	// NOTE: everything inside this template literal is emitted verbatim.
	return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>Instance AI — Pairwise Eval Report</title>
<script src="https://cdn.jsdelivr.net/npm/@webcomponents/webcomponentsjs@2.0.0/webcomponents-loader.js"></script>
<script src="https://www.unpkg.com/lit@2.0.0-rc.2/polyfill-support.js"></script>
<script type="module" src="https://cdn.jsdelivr.net/npm/@n8n_io/n8n-demo-component/n8n-demo.bundled.js"></script>
<style>
:root {
font-family: ui-sans-serif, system-ui, -apple-system, sans-serif;
color-scheme: dark;
--bg: #0d1117;
--fg: #e6edf3;
--muted: #8b949e;
--border: #30363d;
--card: #161b22;
--subtle: #1c2129;
--pass: #3fb950;
--partial: #d29922;
--fail: #f85149;
--accent: #7c8cff;
}
body { margin: 0; background: var(--bg); color: var(--fg); }
header.top { position: sticky; top: 0; background: var(--card); border-bottom: 1px solid var(--border); padding: 12px 20px; z-index: 10; }
header.top h1 { margin: 0; font-size: 18px; }
main { padding: 20px; display: flex; flex-direction: column; gap: 32px; max-width: 1400px; margin: 0 auto; }
section.run { background: var(--card); border: 1px solid var(--border); border-radius: 8px; overflow: hidden; }
section.run header.run-header { padding: 16px 20px; border-bottom: 1px solid var(--border); background: var(--subtle); }
section.run header.run-header h2 { margin: 0 0 8px 0; font-size: 16px; }
.run-meta, .run-totals, .run-interactivity { display: flex; flex-wrap: wrap; gap: 16px; font-size: 12px; color: var(--muted); }
.run-totals { margin-top: 8px; font-size: 13px; color: var(--fg); }
.run-totals .total.success strong { color: var(--pass); }
.run-totals .total.fail strong { color: var(--fail); }
.run-interactivity { margin-top: 8px; }
.examples { display: flex; flex-direction: column; }
details.example { border-bottom: 1px solid var(--border); }
details.example:last-child { border-bottom: none; }
details.example > summary {
list-style: none;
cursor: pointer;
padding: 10px 20px;
display: grid;
grid-template-columns: 160px minmax(0, 1fr) 40px 80px auto;
gap: 12px;
align-items: center;
font-size: 13px;
}
details.example > summary:hover { background: var(--subtle); }
details.example > summary::-webkit-details-marker { display: none; }
details.example > summary .status {
font-weight: 700;
font-size: 11px;
padding: 2px 6px;
border-radius: 3px;
letter-spacing: 0.03em;
text-align: center;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}
details.example > summary .summary-text { display: flex; flex-direction: column; gap: 2px; min-width: 0; }
details.example > summary .prompt-preview { font-size: 13px; color: var(--fg); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
details.ex-pass > summary .status { background: rgba(63,185,80,0.18); color: var(--pass); }
details.ex-partial > summary .status { background: rgba(210,153,34,0.18); color: var(--partial); }
details.ex-fail > summary .status { background: rgba(248,81,73,0.18); color: var(--fail); }
details.example > summary .example-id { font-family: ui-monospace, monospace; font-size: 11px; color: var(--muted); overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
details.example > summary .iteration { color: var(--muted); font-size: 11px; }
details.example > summary .duration { color: var(--muted); font-size: 11px; text-align: right; }
details.example > summary .badges { display: flex; gap: 6px; flex-wrap: wrap; justify-content: flex-end; }
.badge { font-size: 11px; padding: 2px 6px; border-radius: 3px; background: rgba(139,148,158,0.18); color: var(--fg); }
.badge.badge-pass { background: rgba(63,185,80,0.2); color: var(--pass); }
.badge.badge-fail { background: rgba(248,81,73,0.2); color: var(--fail); }
details.example > .body { padding: 16px 20px 24px; background: var(--subtle); }
details.example > .body h3 { margin: 16px 0 6px 0; font-size: 13px; text-transform: uppercase; color: var(--muted); letter-spacing: 0.05em; }
details.example pre { background: var(--bg); border: 1px solid var(--border); border-radius: 4px; padding: 8px 12px; font-size: 12px; white-space: pre-wrap; color: var(--fg); }
.criteria-row { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; margin-top: 8px; }
.criteria { border: 1px solid var(--border); border-radius: 4px; padding: 8px 12px; background: var(--card); }
.criteria h4 { margin: 0 0 4px 0; font-size: 11px; text-transform: uppercase; letter-spacing: 0.05em; }
.criteria.do h4 { color: var(--pass); }
.criteria.dont h4 { color: var(--fail); }
.criteria ul { margin: 0; padding-left: 18px; font-size: 12px; }
.error { margin-top: 8px; padding: 8px 12px; background: rgba(248,81,73,0.12); color: var(--fail); border-radius: 4px; font-size: 12px; white-space: pre-wrap; }
.interactivity { margin-top: 8px; font-size: 11px; color: var(--muted); }
.workflow-section { margin-top: 8px; }
n8n-demo, .workflow-mount { display: block; height: 380px; border: 1px solid var(--border); border-radius: 4px; background: #fff; color-scheme: light; }
.no-workflow { padding: 40px; text-align: center; color: var(--muted); font-size: 13px; border: 1px dashed var(--border); border-radius: 4px; }
table.judges { margin-top: 12px; width: 100%; border-collapse: collapse; font-size: 12px; background: var(--card); border: 1px solid var(--border); border-radius: 4px; overflow: hidden; }
table.judges th, table.judges td { padding: 6px 10px; text-align: left; border-bottom: 1px solid var(--border); }
table.judges tr:last-child td { border-bottom: none; }
table.judges td.judge-pass { color: var(--pass); font-weight: 600; }
table.judges td.judge-fail { color: var(--fail); font-weight: 600; }
.tool-calls-section { margin-top: 12px; }
details.tool-calls-section > summary { cursor: pointer; list-style: none; }
details.tool-calls-section > summary::-webkit-details-marker { display: none; }
details.tool-calls-section > summary h3 { display: inline; }
details.tool-calls-section > summary h3::before { content: '▸ '; color: var(--muted); }
details.tool-calls-section[open] > summary h3::before { content: '▾ '; }
.no-tools { color: var(--muted); font-size: 12px; padding: 8px 12px; background: var(--card); border: 1px dashed var(--border); border-radius: 4px; }
ol.tool-calls { list-style: none; padding: 0; margin: 0; display: flex; flex-direction: column; gap: 8px; }
li.tool-call { background: var(--card); border: 1px solid var(--border); border-radius: 4px; padding: 8px 12px; }
.tool-call-header { display: grid; grid-template-columns: 36px minmax(0, 1fr) 80px auto; gap: 10px; align-items: center; font-size: 12px; }
.tool-step { color: var(--muted); font-family: ui-monospace, monospace; }
.tool-name { font-weight: 600; color: var(--fg); font-family: ui-monospace, monospace; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
.tool-elapsed { color: var(--muted); font-size: 11px; text-align: right; }
.tool-states { display: flex; gap: 4px; flex-wrap: wrap; justify-content: flex-end; }
.tool-state { font-size: 10px; padding: 2px 6px; border-radius: 3px; letter-spacing: 0.03em; text-transform: uppercase; font-weight: 600; }
.tool-state.ok { background: rgba(63,185,80,0.18); color: var(--pass); }
.tool-state.error { background: rgba(248,81,73,0.18); color: var(--fail); }
.tool-state.pending { background: rgba(139,148,158,0.18); color: var(--muted); }
.tool-state.suspended { background: rgba(210,153,34,0.18); color: var(--partial); }
.tool-state.suspended.auto { background: rgba(124,140,255,0.18); color: var(--accent); }
.tool-suspension { margin-top: 6px; padding: 6px 10px; background: rgba(210,153,34,0.08); border-left: 2px solid var(--partial); border-radius: 2px; }
.tool-message { font-size: 12px; color: var(--fg); white-space: pre-wrap; }
.tool-block { margin-top: 6px; }
.tool-block > summary { cursor: pointer; font-size: 11px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.05em; padding: 2px 0; }
.tool-block > summary:hover { color: var(--accent); }
.tool-block pre { background: var(--bg); border: 1px solid var(--border); border-radius: 4px; padding: 8px 12px; font-size: 11px; max-height: 320px; overflow: auto; white-space: pre-wrap; word-break: break-word; color: var(--fg); margin: 4px 0 0 0; font-family: ui-monospace, monospace; }
.tool-block.tool-error pre { border-color: var(--fail); }
</style>
</head>
<body>
<header class="top">
<h1>Instance AI — Pairwise Eval Report (${runs.length} run${runs.length === 1 ? '' : 's'})</h1>
</header>
<main>${body}</main>
<script>
function mountWorkflows(scope) {
scope.querySelectorAll('.workflow-mount[data-workflow]').forEach((mount) => {
const json = mount.getAttribute('data-workflow');
const demo = document.createElement('n8n-demo');
demo.setAttribute('workflow', json);
demo.setAttribute('frame', 'true');
demo.setAttribute('clicktointeract', 'true');
demo.setAttribute('collapseformobile', 'true');
mount.replaceWith(demo);
});
}
document.querySelectorAll('details.example').forEach((el) => {
if (el.open) mountWorkflows(el);
el.addEventListener('toggle', () => {
if (el.open) mountWorkflows(el);
});
});
</script>
</body>
</html>`;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// CLI
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Parsed CLI options for the report generator (see `parseArgs`). */
interface ReportArgs {
	// Absolute root directory whose runs are loaded via `loadRuns`.
	// Defaults to <cwd>/.output/pairwise when --output-root is not given.
	outputRoot: string;
	// Absolute path the rendered HTML report is written to.
	// Defaults to <outputRoot>/report.html when --report-file is not given.
	reportFile: string;
}
|
||||
|
||||
function parseArgs(argv: string[]): ReportArgs {
|
||||
const get = (flag: string): string | undefined => {
|
||||
const idx = argv.indexOf(flag);
|
||||
if (idx === -1) return undefined;
|
||||
const value = argv[idx + 1];
|
||||
return value && !value.startsWith('--') ? value : undefined;
|
||||
};
|
||||
|
||||
const defaultRoot = path.resolve(process.cwd(), '.output', 'pairwise');
|
||||
const outputRoot = path.resolve(get('--output-root') ?? defaultRoot);
|
||||
const reportFile = path.resolve(get('--report-file') ?? path.join(outputRoot, 'report.html'));
|
||||
return { outputRoot, reportFile };
|
||||
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
const args = parseArgs(process.argv.slice(2));
|
||||
const runs = await loadRuns(args.outputRoot);
|
||||
if (runs.length === 0) {
|
||||
console.error(`No runs found under ${args.outputRoot}`);
|
||||
process.exit(1);
|
||||
}
|
||||
const html = renderDocument(runs);
|
||||
await fs.writeFile(args.reportFile, html, 'utf8');
|
||||
console.log(`Wrote ${runs.length} run(s) to ${args.reportFile}`);
|
||||
}
|
||||
|
||||
// Run only when executed directly, not when this module is imported.
if (require.main === module) {
	main().catch((error) => {
		// Prefer the stack trace when available, then exit non-zero so callers
		// (scripts/CI) see the failure.
		console.error(error instanceof Error ? (error.stack ?? error.message) : String(error));
		process.exit(1);
	});
}
|
||||
|
|
@ -7,6 +7,7 @@
|
|||
// ---------------------------------------------------------------------------
|
||||
|
||||
import type {
|
||||
InstanceAiConfirmRequest,
|
||||
InstanceAiRichMessagesResponse,
|
||||
InstanceAiEvalExecutionResult,
|
||||
InstanceAiEvalSubAgentRequest,
|
||||
|
|
@ -121,16 +122,12 @@ export class N8nClient {
|
|||
/**
|
||||
* Confirm or reject an action requested by the agent.
|
||||
* POST /rest/instance-ai/confirm/:requestId
|
||||
* body: { approved, mockCredentials?, credentialId?, ... }
|
||||
* body: kind-tagged `InstanceAiConfirmRequest` discriminated union.
|
||||
*/
|
||||
async confirmAction(
|
||||
requestId: string,
|
||||
approved: boolean,
|
||||
options?: { mockCredentials?: boolean },
|
||||
): Promise<void> {
|
||||
async confirmAction(requestId: string, payload: InstanceAiConfirmRequest): Promise<void> {
|
||||
await this.fetch(`/rest/instance-ai/confirm/${requestId}`, {
|
||||
method: 'POST',
|
||||
body: { approved, ...options },
|
||||
body: payload,
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user