fix(core): Update instance-ai evaluator to include pinned subnodes and allow all mcp tools (#30292)

2026-05-12 16:10:30 +02:00 · 2026-05-12 11:13:01 +02:00 · 2026-05-12 11:13:01 +02:00 · 54d62bb4a1
commit 54d62bb4a1
parent a60ef7dbb5
3 changed files with 23 additions and 15 deletions
--- a/packages/@n8n/instance-ai/evaluations/cli/build-mcp-manifest.ts
+++ b/packages/@n8n/instance-ai/evaluations/cli/build-mcp-manifest.ts
@@ -273,20 +273,8 @@ function sanitizeServerName(name: string): string {
 	return name.replace(/[^a-zA-Z0-9-]/g, '_');
 }

-const INSTANCE_MCP_TOOLS = [
-	'get_sdk_reference',
-	'search_nodes',
-	'get_suggested_nodes',
-	'get_node_types',
-	'validate_workflow',
-	'create_workflow_from_code',
-	'archive_workflow',
-	'update_workflow',
-] as const;
-
 function buildAllowedTools(serverName: string): readonly string[] {
-	const prefix = `mcp__${sanitizeServerName(serverName)}__`;
-	return INSTANCE_MCP_TOOLS.map((t) => `${prefix}${t}`);
+	return [`mcp__${sanitizeServerName(serverName)}`];
 }

 // ---------------------------------------------------------------------------
--- a/packages/@n8n/instance-ai/evaluations/harness/runner.ts
+++ b/packages/@n8n/instance-ai/evaluations/harness/runner.ts
@@ -508,6 +508,25 @@ function buildVerificationArtifact(
 			sections.push(`- **${node.name ?? '(unnamed)'}** (${node.type}) — ${status}`);
 		}
 		sections.push('');
+		sections.push(
+			'**All node configs** (from saved workflow JSON, including nodes that did not run):',
+		);
+		sections.push(
+			'```json',
+			JSON.stringify(
+				wf.nodes.map((node) => ({
+					name: node.name ?? '(unnamed)',
+					type: node.type,
+					typeVersion: node.typeVersion,
+					...(node.disabled !== undefined ? { disabled: node.disabled } : {}),
+					parameters: node.parameters ?? {},
+				})),
+				null,
+				2,
+			),
+			'```',
+		);
+		sections.push('');
 		sections.push('**Connections:**');
 		sections.push('```json', JSON.stringify(wf.connections, null, 2), '```');
 		sections.push('');
--- a/packages/@n8n/instance-ai/evaluations/system-prompts/mock-execution-verify.ts
+++ b/packages/@n8n/instance-ai/evaluations/system-prompts/mock-execution-verify.ts
@@ -5,10 +5,11 @@ export const MOCK_EXECUTION_VERIFY_PROMPT = `You are an expert evaluator for n8n
 This is a test environment. No real credentials or API connections exist. ALL HTTP calls are intercepted and answered by an LLM mock. This is by design — the purpose is to test the workflow structure and data flow without real services.

 - **Mocked nodes**: Made HTTP requests that were intercepted. An LLM generated the response. The node then processed the mock response using its real code. These nodes have NO real credentials — they use mock credentials that allow the node code to run but never reach real APIs.
-- **Pinned nodes**: Trigger/start nodes whose output was generated by an LLM to simulate incoming data (webhooks, schedules). They didn't execute — their output was injected directly.
+- **Pinned nodes**: Nodes whose output was generated by an LLM and injected directly. This includes trigger/start nodes that simulate incoming data (webhooks, schedules), AI root nodes (Agent/Chain nodes), and protocol nodes that cannot be safely executed without real providers or credentials.
 - **Real nodes**: Logic nodes (Code, Set, Merge, Filter, Sort, IF, Switch) that executed their actual code on data from mocked/pinned upstream nodes.

 IMPORTANT: Nodes receiving mock responses instead of real API responses is EXPECTED. Missing or mock credentials is EXPECTED. Don't flag these as issues — they are the testing mechanism itself.
+IMPORTANT: When an AI root node such as an AI Agent is pinned, its connected AI subnodes (language model, memory, tools, retrievers, parsers) often do not run. This is expected. Evaluate those subnodes from the saved workflow structure, connections, and all-node configs instead of failing only because the subnode did not execute.

 Credential ID values in the workflow JSON (real, placeholder strings, or stale references) never cause execution failures. When a credential ID cannot be resolved, the framework substitutes a mock credential and execution proceeds. Do not cite credential ID values as a root cause of failure under any circumstance.

@@ -18,7 +19,7 @@ The verification artifact contains:
 - **Pre-analysis**: Automated flags for known issues (builder config problems, mock generation failures)
 - **Execution summary**: Which nodes were mocked, pinned, or real
 - **Errors**: Any runtime errors from the execution
-- **Workflow structure**: ALL nodes that were built, whether they executed or not, plus the full connections JSON showing how nodes are wired. Use this to verify node existence and wiring before making claims about missing nodes or wrong connections.
+- **Workflow structure**: ALL nodes that were built, whether they executed or not, the saved config for every node, plus the full connections JSON showing how nodes are wired. Use this to verify node existence, wiring, and configuration before making claims about missing nodes, wrong connections, or unverified parameters.
 - **Execution trace**: Per-node detail including HTTP requests sent, mock responses returned, and node output. Only includes nodes that actually ran. **IMPORTANT: The trace is NOT in chronological order.** Do not infer execution sequence from the order nodes appear in the trace. Use the connections JSON in the workflow structure to determine execution flow.
 - **Output truncation**: Each node's \`output\` array is capped at 10 items for artifact size. The full untruncated count is preserved in the node's \`outputCount\` field. **Do not treat a smaller \`output\` array as a bug.** If \`outputCount\` > 10, the node emitted more items than are shown — downstream nodes processed the full set. Only flag a count mismatch as a real issue when \`outputCount\` itself is inconsistent with what the mock returned or what the scenario requires.