fix(core): Avoid unnecessary planner credential prompts (#30451)

This commit is contained in:
Albert Alises 2026-05-14 17:36:11 +02:00 committed by GitHub
parent 94113f44a5
commit f1fd79f830
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 169 additions and 10 deletions

View File

@ -23,6 +23,7 @@ describe('loadDiscoveryTestCasesWithFiles', () => {
'screenshot-dashboard',
'http-node-config-no-browser',
'oauth-with-computer-use-disabled',
'planner-no-credential-ask',
]),
);
});
@ -33,6 +34,7 @@ describe('loadDiscoveryTestCasesWithFiles', () => {
'screenshot-dashboard',
'http-node-config-no-browser',
'oauth-with-computer-use-disabled',
'planner-no-credential-ask',
])('%s parses with a valid expectedToolInvocations rule', (slug) => {
const entry = cases.find((c) => c.fileSlug === slug);
expect(entry).toBeDefined();

View File

@ -0,0 +1,18 @@
{
"id": "planner-no-credential-ask",
"userMessage": "Build a workflow that lets customers request a meeting time in natural language, parses the preferred time with OpenAI, creates a Google Calendar event, and sends a Gmail confirmation email.",
"expectedToolInvocations": {
"anyOf": ["plan", "spawn_sub_agent:planner"],
"noneOfToolCalls": [
{
"toolName": "ask-user",
"argsContainAny": ["credential", "credentials", "account", "auth", "api key", "token"]
},
{
"toolName": "ask-user",
"argsContainAny": ["timezone", "time zone"]
}
]
},
"rationale": "Regression coverage for INS-204. A fresh workflow plan may use credentials(action=\"list\") for discovery, but planning must not ask the user which credential/account to use when the builder can auto-select or mock unresolved credentials. The planner should also use contextual timezone data rather than asking for it."
}

View File

@ -3,7 +3,7 @@ import { runExpectedToolsInvokedCheck } from '../expected-tools-invoked';
import type { DiscoveryTestCase } from '../types';
function makeOutcome(opts: {
toolCalls?: Array<Pick<CapturedToolCall, 'toolName'>>;
toolCalls?: Array<Pick<CapturedToolCall, 'toolName'> & Partial<Pick<CapturedToolCall, 'args'>>>;
agents?: Array<Pick<AgentActivity, 'role' | 'tools'>>;
}): EventOutcome {
return {
@ -14,7 +14,7 @@ function makeOutcome(opts: {
toolCalls: (opts.toolCalls ?? []).map((tc, i) => ({
toolCallId: `call-${i}`,
toolName: tc.toolName,
args: {},
args: tc.args ?? {},
durationMs: 0,
})),
agentActivities: (opts.agents ?? []).map((a, i) => ({
@ -175,6 +175,65 @@ describe('runExpectedToolsInvokedCheck', () => {
});
});
describe('noneOfToolCalls — actual tool-call guard', () => {
const plannerScenario: DiscoveryTestCase = {
id: 'test',
userMessage: 'Build a Gmail and Calendar workflow',
expectedToolInvocations: {
anyOf: ['plan', 'spawn_sub_agent:planner'],
noneOfToolCalls: [{ toolName: 'ask-user', argsContainAny: ['credential'] }],
},
};
it('passes when a spawned planner has ask-user available but does not call it', () => {
const result = runExpectedToolsInvokedCheck(
plannerScenario,
makeOutcome({
toolCalls: [{ toolName: 'plan' }],
agents: [{ role: 'planner', tools: ['credentials', 'ask-user'] }],
}),
);
expect(result.pass).toBe(true);
expect(result.invokedTools).toContain('ask-user');
});
it('fails when the forbidden tool call happens with matching args', () => {
const result = runExpectedToolsInvokedCheck(
plannerScenario,
makeOutcome({
toolCalls: [
{ toolName: 'plan' },
{
toolName: 'ask-user',
args: { question: 'Which Google Calendar credential should I use?' },
},
],
agents: [{ role: 'planner', tools: ['credentials', 'ask-user'] }],
}),
);
expect(result.pass).toBe(false);
expect(result.comment).toContain('Expected no actual tool call matching');
expect(result.comment).toContain('credential');
});
it('passes when the same tool is called for unrelated args', () => {
const result = runExpectedToolsInvokedCheck(
plannerScenario,
makeOutcome({
toolCalls: [
{ toolName: 'plan' },
{ toolName: 'ask-user', args: { question: 'Which failure branch should run?' } },
],
agents: [{ role: 'planner', tools: ['credentials', 'ask-user'] }],
}),
);
expect(result.pass).toBe(true);
});
});
describe('rule validation', () => {
it('throws when neither anyOf nor noneOf is provided', () => {
expect(() =>

View File

@ -18,7 +18,12 @@
// ---------------------------------------------------------------------------
import type { EventOutcome } from '../types';
import type { DiscoveryCheckResult, DiscoveryTestCase, ExpectedToolInvocations } from './types';
import type {
DiscoveryCheckResult,
DiscoveryTestCase,
ExpectedToolInvocations,
ForbiddenToolCall,
} from './types';
const SPAWN_PREFIX = 'spawn_sub_agent:';
@ -52,11 +57,35 @@ function matches(name: string, invokedTools: string[], spawnedAgents: string[]):
function validateRule(rule: ExpectedToolInvocations): void {
const hasAnyOf = Array.isArray(rule.anyOf) && rule.anyOf.length > 0;
const hasNoneOf = Array.isArray(rule.noneOf) && rule.noneOf.length > 0;
if (!hasAnyOf && !hasNoneOf) {
throw new Error('expectedToolInvocations must specify a non-empty `anyOf` or `noneOf` list');
const hasNoneOfToolCalls = Array.isArray(rule.noneOfToolCalls) && rule.noneOfToolCalls.length > 0;
if (!hasAnyOf && !hasNoneOf && !hasNoneOfToolCalls) {
throw new Error(
'expectedToolInvocations must specify a non-empty `anyOf`, `noneOf`, or `noneOfToolCalls` list',
);
}
}
function toolCallMatchesExpectation(
toolCall: EventOutcome['toolCalls'][number],
expectation: ForbiddenToolCall,
): boolean {
if (toolCall.toolName !== expectation.toolName) return false;
const argsContainAny = expectation.argsContainAny ?? [];
if (argsContainAny.length === 0) return true;
const argsText = JSON.stringify(toolCall.args).toLowerCase();
return argsContainAny.some((term) => argsText.includes(term.toLowerCase()));
}
function formatForbiddenToolCall(expectation: ForbiddenToolCall): string {
const args =
expectation.argsContainAny && expectation.argsContainAny.length > 0
? ` with args containing one of [${expectation.argsContainAny.join(', ')}]`
: '';
return `${expectation.toolName}${args}`;
}
export function runExpectedToolsInvokedCheck(
scenario: DiscoveryTestCase,
outcome: EventOutcome,
@ -66,7 +95,7 @@ export function runExpectedToolsInvokedCheck(
const invokedTools = collectInvokedTools(outcome);
const spawnedAgents = collectSpawnedAgents(outcome);
const { anyOf, noneOf } = scenario.expectedToolInvocations;
const { anyOf, noneOf, noneOfToolCalls } = scenario.expectedToolInvocations;
if (anyOf && anyOf.length > 0) {
const matched = anyOf.find((name) => matches(name, invokedTools, spawnedAgents));
@ -92,6 +121,22 @@ export function runExpectedToolsInvokedCheck(
}
}
if (noneOfToolCalls && noneOfToolCalls.length > 0) {
for (const expectation of noneOfToolCalls) {
const violated = outcome.toolCalls.find((toolCall) =>
toolCallMatchesExpectation(toolCall, expectation),
);
if (violated) {
return {
pass: false,
comment: `Expected no actual tool call matching [${formatForbiddenToolCall(expectation)}], but saw ${violated.toolName} with args ${JSON.stringify(violated.args)}.`,
invokedTools,
spawnedAgents,
};
}
}
}
return {
pass: true,
comment: 'Discovery expectation satisfied.',

View File

@ -17,13 +17,23 @@ import type { LocalGatewayStatus } from '../../src/types';
* (top-level orchestrator call, or via a spawned sub-agent's tool list).
* - `noneOf` pass only if NONE of the listed tool names was invoked.
* Used for negative scenarios that guard against over-eager invocation.
* - `noneOfToolCalls` pass only if NONE of the listed tool calls happened.
* Unlike `noneOf`, this checks actual tool calls only; a spawned sub-agent
* having the tool available does not count as a violation.
*
* Both forms accept a sub-agent role prefix `spawn_sub_agent:<role>` to match
* an `agent-spawned` event whose role equals `<role>`.
*/
export interface ForbiddenToolCall {
toolName: string;
/** Optional case-insensitive substrings to match against the serialized tool args. */
argsContainAny?: string[];
}
export interface ExpectedToolInvocations {
anyOf?: string[];
noneOf?: string[];
noneOfToolCalls?: ForbiddenToolCall[];
}
/**
@ -43,7 +53,7 @@ export interface DiscoveryTestCase {
userMessage: string;
/** Optional instance state overrides applied when constructing the agent. */
instanceState?: DiscoveryInstanceState;
/** Pass condition. Exactly one of the form keys (`anyOf` / `noneOf`) is required. */
/** Pass condition. At least one expectation key is required. */
expectedToolInvocations: ExpectedToolInvocations;
/** Free-form note explaining what regression this scenario protects against. */
rationale?: string;

View File

@ -33,14 +33,33 @@ describe('credential guardrail prompts', () => {
);
});
it('tells the planner to ask when a required service has more than one credential of the same type', () => {
it('tells the planner not to block planning on credential selection', () => {
expect(PLANNER_AGENT_PROMPT).toContain('Handle credentials without blocking planning');
expect(PLANNER_AGENT_PROMPT).toContain('If the user already named a credential');
expect(PLANNER_AGENT_PROMPT).toContain('If there is exactly one matching credential');
expect(PLANNER_AGENT_PROMPT).toContain('auto-select it, do not ask');
expect(PLANNER_AGENT_PROMPT).toContain('If there are no matching credentials, do not ask');
expect(PLANNER_AGENT_PROMPT).toContain('builder will use a mocked or unresolved credential');
expect(PLANNER_AGENT_PROMPT).toContain(
'Do ask when a required service has more than one credential of the same type',
'If there is more than one credential of the same required type',
);
expect(PLANNER_AGENT_PROMPT).toContain('ask once with a single-select');
expect(PLANNER_AGENT_PROMPT).toContain('cannot be discovered, only chosen');
expect(PLANNER_AGENT_PROMPT).toContain('credential-backed resource investigation');
expect(PLANNER_AGENT_PROMPT).toContain('Do not turn that into a credential-choice question');
expect(PLANNER_AGENT_PROMPT).toContain('Record the chosen credential name in `assumptions`');
});
it('tells the planner to use the contextual timezone before asking', () => {
expect(PLANNER_AGENT_PROMPT).toContain(
"Never ask for the user's timezone when `<user-timezone>` is present",
);
expect(PLANNER_AGENT_PROMPT).toContain('use `<current-datetime>` / `<user-timezone>`');
expect(PLANNER_AGENT_PROMPT).toContain(
'Only ask if timezone is missing and a date or schedule cannot be interpreted safely',
);
});
it('tells the builder to wrap ambiguous resource matches with placeholder()', () => {
// Both prompts inline PLACEHOLDERS_RULE, which now covers the multi-match case.
const sharedRule = '**Resource IDs with more than one candidate**';

View File

@ -19,10 +19,16 @@ ${SUBAGENT_OUTPUT_CONTRACT}
1. **Prefer assumptions over questions.** The user is waiting for a plan, and they can reject it if your assumptions are wrong so default to making reasonable choices rather than asking.
- **Never ask about things you can discover** call \`credentials(action="list")\`, \`data-tables(action="list")\`, \`templates(action="best-practices")\` instead.
- **Never ask about implementation details** trigger types, node choices, schedule times, column names. Pick sensible defaults.
- **Never ask for the user's timezone when \`<user-timezone>\` is present** — use \`<current-datetime>\` / \`<user-timezone>\`. Only ask if timezone is missing and a date or schedule cannot be interpreted safely.
- **Never default resource identifiers** the user didn't mention (Slack channels, calendars, spreadsheets, folders, etc.) leave them for the builder to resolve at build time.
- **Trust already-collected briefing context** if the briefing includes an Already-collected answers or Already-discovered resources section, treat those entries as authoritative. Do not ask again for purpose, trigger, integrations, schedule, model, resource, or credential choices already listed there.
- **Do ask when the answer would significantly change the plan** e.g. the user's goal is ambiguous ("build me a CRM" for sales? support? recruiting?), or a business rule must come from the user ("what should happen when payment fails?").
- **Do ask when a required service has more than one credential of the same type** (e.g. two \`openAiApi\` accounts, three Google Calendar accounts) — which one to use cannot be discovered, only chosen. Record the chosen credential name in \`assumptions\`.
- **Handle credentials without blocking planning.** Call \`credentials(action="list")\` for external services, then apply these cases:
- If the user already named a credential in their request, use it directly and record the credential name in \`assumptions\`.
- If there is exactly one matching credential for a required type, auto-select it, do not ask, and record the credential name in \`assumptions\`.
- If there are no matching credentials, do not ask; plan normally and note that the builder will use a mocked or unresolved credential and route setup after verification.
- If there is more than one credential of the same required type and the user did not name one, ask once with a single-select because the choice cannot be discovered, only chosen. Record the chosen credential name in \`assumptions\`.
- **Use credential-backed resource investigation only when it changes the plan.** You may call \`credentials(action="list")\` so a later resource lookup can validate a resource that affects the architecture (for example checking whether a named Slack channel exists). Do not turn that into a credential-choice question unless the multiple-credentials rule above applies.
- **List your assumptions** on your first \`add-plan-item\` call. The user reviews the plan before execution and can reject/correct.
2. **Discover** check what exists and learn best practices. Expect 36 tool calls for a typical request: