diff --git a/packages/@n8n/instance-ai/evaluations/__tests__/data-discovery.test.ts b/packages/@n8n/instance-ai/evaluations/__tests__/data-discovery.test.ts index ee3c157ce99..44909975326 100644 --- a/packages/@n8n/instance-ai/evaluations/__tests__/data-discovery.test.ts +++ b/packages/@n8n/instance-ai/evaluations/__tests__/data-discovery.test.ts @@ -23,6 +23,7 @@ describe('loadDiscoveryTestCasesWithFiles', () => { 'screenshot-dashboard', 'http-node-config-no-browser', 'oauth-with-computer-use-disabled', + 'planner-no-credential-ask', ]), ); }); @@ -33,6 +34,7 @@ describe('loadDiscoveryTestCasesWithFiles', () => { 'screenshot-dashboard', 'http-node-config-no-browser', 'oauth-with-computer-use-disabled', + 'planner-no-credential-ask', ])('%s parses with a valid expectedToolInvocations rule', (slug) => { const entry = cases.find((c) => c.fileSlug === slug); expect(entry).toBeDefined(); diff --git a/packages/@n8n/instance-ai/evaluations/data/discovery/planner-no-credential-ask.json b/packages/@n8n/instance-ai/evaluations/data/discovery/planner-no-credential-ask.json new file mode 100644 index 00000000000..c761782215a --- /dev/null +++ b/packages/@n8n/instance-ai/evaluations/data/discovery/planner-no-credential-ask.json @@ -0,0 +1,18 @@ +{ + "id": "planner-no-credential-ask", + "userMessage": "Build a workflow that lets customers request a meeting time in natural language, parses the preferred time with OpenAI, creates a Google Calendar event, and sends a Gmail confirmation email.", + "expectedToolInvocations": { + "anyOf": ["plan", "spawn_sub_agent:planner"], + "noneOfToolCalls": [ + { + "toolName": "ask-user", + "argsContainAny": ["credential", "credentials", "account", "auth", "api key", "token"] + }, + { + "toolName": "ask-user", + "argsContainAny": ["timezone", "time zone"] + } + ] + }, + "rationale": "Regression coverage for INS-204. A fresh workflow plan may use credentials(action=\"list\") for discovery, but planning must not ask the user which credential/account to use when the builder can auto-select or mock unresolved credentials. The planner should also use contextual timezone data rather than asking for it." +} diff --git a/packages/@n8n/instance-ai/evaluations/discovery/__tests__/expected-tools-invoked.test.ts b/packages/@n8n/instance-ai/evaluations/discovery/__tests__/expected-tools-invoked.test.ts index 4dc04aa055d..120989a7112 100644 --- a/packages/@n8n/instance-ai/evaluations/discovery/__tests__/expected-tools-invoked.test.ts +++ b/packages/@n8n/instance-ai/evaluations/discovery/__tests__/expected-tools-invoked.test.ts @@ -3,7 +3,7 @@ import { runExpectedToolsInvokedCheck } from '../expected-tools-invoked'; import type { DiscoveryTestCase } from '../types'; function makeOutcome(opts: { - toolCalls?: Array>; + toolCalls?: Array & Partial>>; agents?: Array>; }): EventOutcome { return { @@ -14,7 +14,7 @@ function makeOutcome(opts: { toolCalls: (opts.toolCalls ?? []).map((tc, i) => ({ toolCallId: `call-${i}`, toolName: tc.toolName, - args: {}, + args: tc.args ?? {}, durationMs: 0, })), agentActivities: (opts.agents ?? []).map((a, i) => ({ @@ -175,6 +175,65 @@ describe('runExpectedToolsInvokedCheck', () => { }); }); + describe('noneOfToolCalls — actual tool-call guard', () => { + const plannerScenario: DiscoveryTestCase = { + id: 'test', + userMessage: 'Build a Gmail and Calendar workflow', + expectedToolInvocations: { + anyOf: ['plan', 'spawn_sub_agent:planner'], + noneOfToolCalls: [{ toolName: 'ask-user', argsContainAny: ['credential'] }], + }, + }; + + it('passes when a spawned planner has ask-user available but does not call it', () => { + const result = runExpectedToolsInvokedCheck( + plannerScenario, + makeOutcome({ + toolCalls: [{ toolName: 'plan' }], + agents: [{ role: 'planner', tools: ['credentials', 'ask-user'] }], + }), + ); + + expect(result.pass).toBe(true); + expect(result.invokedTools).toContain('ask-user'); + }); + + it('fails when the forbidden tool call happens with matching args', () => { + const result = runExpectedToolsInvokedCheck( + plannerScenario, + makeOutcome({ + toolCalls: [ + { toolName: 'plan' }, + { + toolName: 'ask-user', + args: { question: 'Which Google Calendar credential should I use?' }, + }, + ], + agents: [{ role: 'planner', tools: ['credentials', 'ask-user'] }], + }), + ); + + expect(result.pass).toBe(false); + expect(result.comment).toContain('Expected no actual tool call matching'); + expect(result.comment).toContain('credential'); + }); + + it('passes when the same tool is called for unrelated args', () => { + const result = runExpectedToolsInvokedCheck( + plannerScenario, + makeOutcome({ + toolCalls: [ + { toolName: 'plan' }, + { toolName: 'ask-user', args: { question: 'Which failure branch should run?' } }, + ], + agents: [{ role: 'planner', tools: ['credentials', 'ask-user'] }], + }), + ); + + expect(result.pass).toBe(true); + }); + }); + describe('rule validation', () => { it('throws when neither anyOf nor noneOf is provided', () => { expect(() => diff --git a/packages/@n8n/instance-ai/evaluations/discovery/expected-tools-invoked.ts b/packages/@n8n/instance-ai/evaluations/discovery/expected-tools-invoked.ts index e81db7ee646..639cefd29ab 100644 --- a/packages/@n8n/instance-ai/evaluations/discovery/expected-tools-invoked.ts +++ b/packages/@n8n/instance-ai/evaluations/discovery/expected-tools-invoked.ts @@ -18,7 +18,12 @@ // --------------------------------------------------------------------------- import type { EventOutcome } from '../types'; -import type { DiscoveryCheckResult, DiscoveryTestCase, ExpectedToolInvocations } from './types'; +import type { + DiscoveryCheckResult, + DiscoveryTestCase, + ExpectedToolInvocations, + ForbiddenToolCall, +} from './types'; const SPAWN_PREFIX = 'spawn_sub_agent:'; @@ -52,11 +57,35 @@ function matches(name: string, invokedTools: string[], spawnedAgents: string[]): function validateRule(rule: ExpectedToolInvocations): void { const hasAnyOf = Array.isArray(rule.anyOf) && rule.anyOf.length > 0; const hasNoneOf = Array.isArray(rule.noneOf) && rule.noneOf.length > 0; - if (!hasAnyOf && !hasNoneOf) { - throw new Error('expectedToolInvocations must specify a non-empty `anyOf` or `noneOf` list'); + const hasNoneOfToolCalls = Array.isArray(rule.noneOfToolCalls) && rule.noneOfToolCalls.length > 0; + if (!hasAnyOf && !hasNoneOf && !hasNoneOfToolCalls) { + throw new Error( + 'expectedToolInvocations must specify a non-empty `anyOf`, `noneOf`, or `noneOfToolCalls` list', + ); } } +function toolCallMatchesExpectation( + toolCall: EventOutcome['toolCalls'][number], + expectation: ForbiddenToolCall, +): boolean { + if (toolCall.toolName !== expectation.toolName) return false; + + const argsContainAny = expectation.argsContainAny ?? []; + if (argsContainAny.length === 0) return true; + + const argsText = JSON.stringify(toolCall.args).toLowerCase(); + return argsContainAny.some((term) => argsText.includes(term.toLowerCase())); +} + +function formatForbiddenToolCall(expectation: ForbiddenToolCall): string { + const args = + expectation.argsContainAny && expectation.argsContainAny.length > 0 + ? ` with args containing one of [${expectation.argsContainAny.join(', ')}]` + : ''; + return `${expectation.toolName}${args}`; +} + export function runExpectedToolsInvokedCheck( scenario: DiscoveryTestCase, outcome: EventOutcome, @@ -66,7 +95,7 @@ export function runExpectedToolsInvokedCheck( const invokedTools = collectInvokedTools(outcome); const spawnedAgents = collectSpawnedAgents(outcome); - const { anyOf, noneOf } = scenario.expectedToolInvocations; + const { anyOf, noneOf, noneOfToolCalls } = scenario.expectedToolInvocations; if (anyOf && anyOf.length > 0) { const matched = anyOf.find((name) => matches(name, invokedTools, spawnedAgents)); @@ -92,6 +121,22 @@ export function runExpectedToolsInvokedCheck( } } + if (noneOfToolCalls && noneOfToolCalls.length > 0) { + for (const expectation of noneOfToolCalls) { + const violated = outcome.toolCalls.find((toolCall) => + toolCallMatchesExpectation(toolCall, expectation), + ); + if (violated) { + return { + pass: false, + comment: `Expected no actual tool call matching [${formatForbiddenToolCall(expectation)}], but saw ${violated.toolName} with args ${JSON.stringify(violated.args)}.`, + invokedTools, + spawnedAgents, + }; + } + } + } + return { pass: true, comment: 'Discovery expectation satisfied.', diff --git a/packages/@n8n/instance-ai/evaluations/discovery/types.ts b/packages/@n8n/instance-ai/evaluations/discovery/types.ts index a24c4fb6358..b20047e7a28 100644 --- a/packages/@n8n/instance-ai/evaluations/discovery/types.ts +++ b/packages/@n8n/instance-ai/evaluations/discovery/types.ts @@ -17,13 +17,23 @@ import type { LocalGatewayStatus } from '../../src/types'; * (top-level orchestrator call, or via a spawned sub-agent's tool list). * - `noneOf` — pass only if NONE of the listed tool names was invoked. * Used for negative scenarios that guard against over-eager invocation. + * - `noneOfToolCalls` — pass only if NONE of the listed tool calls happened. + * Unlike `noneOf`, this checks actual tool calls only; a spawned sub-agent + * having the tool available does not count as a violation. * * Both forms accept a sub-agent role prefix `spawn_sub_agent:` to match * an `agent-spawned` event whose role equals ``. */ +export interface ForbiddenToolCall { + toolName: string; + /** Optional case-insensitive substrings to match against the serialized tool args. */ + argsContainAny?: string[]; +} + export interface ExpectedToolInvocations { anyOf?: string[]; noneOf?: string[]; + noneOfToolCalls?: ForbiddenToolCall[]; } /** @@ -43,7 +53,7 @@ export interface DiscoveryTestCase { userMessage: string; /** Optional instance state overrides applied when constructing the agent. */ instanceState?: DiscoveryInstanceState; - /** Pass condition. Exactly one of the form keys (`anyOf` / `noneOf`) is required. */ + /** Pass condition. At least one expectation key is required. */ expectedToolInvocations: ExpectedToolInvocations; /** Free-form note explaining what regression this scenario protects against. */ rationale?: string; diff --git a/packages/@n8n/instance-ai/src/tools/orchestration/__tests__/credential-guardrails.prompt.test.ts b/packages/@n8n/instance-ai/src/tools/orchestration/__tests__/credential-guardrails.prompt.test.ts index 1264b68f91d..815a39d0769 100644 --- a/packages/@n8n/instance-ai/src/tools/orchestration/__tests__/credential-guardrails.prompt.test.ts +++ b/packages/@n8n/instance-ai/src/tools/orchestration/__tests__/credential-guardrails.prompt.test.ts @@ -33,14 +33,33 @@ describe('credential guardrail prompts', () => { ); }); - it('tells the planner to ask when a required service has more than one credential of the same type', () => { + it('tells the planner not to block planning on credential selection', () => { + expect(PLANNER_AGENT_PROMPT).toContain('Handle credentials without blocking planning'); + expect(PLANNER_AGENT_PROMPT).toContain('If the user already named a credential'); + expect(PLANNER_AGENT_PROMPT).toContain('If there is exactly one matching credential'); + expect(PLANNER_AGENT_PROMPT).toContain('auto-select it, do not ask'); + expect(PLANNER_AGENT_PROMPT).toContain('If there are no matching credentials, do not ask'); + expect(PLANNER_AGENT_PROMPT).toContain('builder will use a mocked or unresolved credential'); expect(PLANNER_AGENT_PROMPT).toContain( - 'Do ask when a required service has more than one credential of the same type', + 'If there is more than one credential of the same required type', ); + expect(PLANNER_AGENT_PROMPT).toContain('ask once with a single-select'); expect(PLANNER_AGENT_PROMPT).toContain('cannot be discovered, only chosen'); + expect(PLANNER_AGENT_PROMPT).toContain('credential-backed resource investigation'); + expect(PLANNER_AGENT_PROMPT).toContain('Do not turn that into a credential-choice question'); expect(PLANNER_AGENT_PROMPT).toContain('Record the chosen credential name in `assumptions`'); }); + it('tells the planner to use the contextual timezone before asking', () => { + expect(PLANNER_AGENT_PROMPT).toContain( + "Never ask for the user's timezone when `` is present", + ); + expect(PLANNER_AGENT_PROMPT).toContain('use `` / ``'); + expect(PLANNER_AGENT_PROMPT).toContain( + 'Only ask if timezone is missing and a date or schedule cannot be interpreted safely', + ); + }); + it('tells the builder to wrap ambiguous resource matches with placeholder()', () => { // Both prompts inline PLACEHOLDERS_RULE, which now covers the multi-match case. const sharedRule = '**Resource IDs with more than one candidate**'; diff --git a/packages/@n8n/instance-ai/src/tools/orchestration/plan-agent-prompt.ts b/packages/@n8n/instance-ai/src/tools/orchestration/plan-agent-prompt.ts index 0bb2f2d9a74..5565ec9bbd2 100644 --- a/packages/@n8n/instance-ai/src/tools/orchestration/plan-agent-prompt.ts +++ b/packages/@n8n/instance-ai/src/tools/orchestration/plan-agent-prompt.ts @@ -19,10 +19,16 @@ ${SUBAGENT_OUTPUT_CONTRACT} 1. **Prefer assumptions over questions.** The user is waiting for a plan, and they can reject it if your assumptions are wrong — so default to making reasonable choices rather than asking. - **Never ask about things you can discover** — call \`credentials(action="list")\`, \`data-tables(action="list")\`, \`templates(action="best-practices")\` instead. - **Never ask about implementation details** — trigger types, node choices, schedule times, column names. Pick sensible defaults. + - **Never ask for the user's timezone when \`\` is present** — use \`\` / \`\`. Only ask if timezone is missing and a date or schedule cannot be interpreted safely. - **Never default resource identifiers** the user didn't mention (Slack channels, calendars, spreadsheets, folders, etc.) — leave them for the builder to resolve at build time. - **Trust already-collected briefing context** — if the briefing includes an Already-collected answers or Already-discovered resources section, treat those entries as authoritative. Do not ask again for purpose, trigger, integrations, schedule, model, resource, or credential choices already listed there. - **Do ask when the answer would significantly change the plan** — e.g. the user's goal is ambiguous ("build me a CRM" — for sales? support? recruiting?), or a business rule must come from the user ("what should happen when payment fails?"). - - **Do ask when a required service has more than one credential of the same type** (e.g. two \`openAiApi\` accounts, three Google Calendar accounts) — which one to use cannot be discovered, only chosen. Record the chosen credential name in \`assumptions\`. + - **Handle credentials without blocking planning.** Call \`credentials(action="list")\` for external services, then apply these cases: + - If the user already named a credential in their request, use it directly and record the credential name in \`assumptions\`. + - If there is exactly one matching credential for a required type, auto-select it, do not ask, and record the credential name in \`assumptions\`. + - If there are no matching credentials, do not ask; plan normally and note that the builder will use a mocked or unresolved credential and route setup after verification. + - If there is more than one credential of the same required type and the user did not name one, ask once with a single-select because the choice cannot be discovered, only chosen. Record the chosen credential name in \`assumptions\`. + - **Use credential-backed resource investigation only when it changes the plan.** You may call \`credentials(action="list")\` so a later resource lookup can validate a resource that affects the architecture (for example checking whether a named Slack channel exists). Do not turn that into a credential-choice question unless the multiple-credentials rule above applies. - **List your assumptions** on your first \`add-plan-item\` call. The user reviews the plan before execution and can reject/correct. 2. **Discover** — check what exists and learn best practices. Expect 3–6 tool calls for a typical request: