n8n/packages/@n8n/nodes-langchain/utils/agent-execution/buildSteps.ts

405 lines
13 KiB
TypeScript

import { AIMessage } from '@langchain/core/messages';
import { nodeNameToToolName } from 'n8n-workflow';
import type { EngineResponse, EngineResult, IDataObject } from 'n8n-workflow';
import type {
RequestResponseMetadata,
ToolCallData,
ThinkingContentBlock,
RedactedThinkingContentBlock,
ToolUseContentBlock,
} from './types';
/**
 * Provider-specific values picked out of a tool action's metadata.
 *
 * Every field is optional: `extractProviderMetadata` leaves a field undefined
 * when the source value is missing or does not have the expected type.
 */
interface ProviderMetadata {
/** Gemini thought_signature for extended thinking (read from `metadata.google.thoughtSignature`) */
thoughtSignature?: string;
/** Anthropic thinking content (read from `metadata.anthropic.thinkingContent`) */
thinkingContent?: string;
/** Anthropic thinking type; only `thinking` and `redacted_thinking` are accepted */
thinkingType?: 'thinking' | 'redacted_thinking';
/** Anthropic thinking signature (read from `metadata.anthropic.thinkingSignature`) */
thinkingSignature?: string;
}
/**
 * Reads the provider-specific fields (Google/Gemini and Anthropic) from a tool
 * action's metadata and keeps only the values that have the expected type.
 *
 * @param metadata - The request/response metadata attached to the tool action
 * @returns An empty object when no metadata is given; otherwise an object with
 * all four fields, each `undefined` when its source value is absent or invalid
 */
function extractProviderMetadata(metadata?: RequestResponseMetadata): ProviderMetadata {
	if (!metadata) {
		return {};
	}

	// Anything that is not a string is treated as "not provided".
	const stringOrUndefined = (value: unknown): string | undefined =>
		typeof value === 'string' ? value : undefined;

	const { google, anthropic } = metadata;

	// Only the two known Anthropic block kinds are accepted.
	const rawThinkingType = anthropic?.thinkingType;
	const thinkingType =
		rawThinkingType === 'thinking' || rawThinkingType === 'redacted_thinking'
			? rawThinkingType
			: undefined;

	return {
		thoughtSignature: stringOrUndefined(google?.thoughtSignature),
		thinkingContent: stringOrUndefined(anthropic?.thinkingContent),
		thinkingType,
		thinkingSignature: stringOrUndefined(anthropic?.thinkingSignature),
	};
}
/**
 * Assembles the content array Anthropic needs in thinking mode: one thinking
 * (or redacted thinking) block followed by one tool_use block.
 *
 * IMPORTANT: the thinking block has to precede tool_use in the message.
 * When content is an array, LangChain ignores the tool_calls field for Anthropic,
 * so the tool_use block must live inside the content array.
 *
 * @param thinkingContent - Thinking text, or the opaque data for a redacted block
 * @param thinkingType - Which kind of block to emit (thinking or redacted_thinking)
 * @param thinkingSignature - Signature for a thinking block; an empty string is used when absent
 * @param toolInput - Tool input data; only its `input` property is forwarded to tool_use
 * @param toolId - The tool call ID
 * @param toolName - The tool name
 * @returns A two-element array: the thinking block, then the tool_use block
 */
function buildAnthropicContentBlocks(
	thinkingContent: string,
	thinkingType: 'thinking' | 'redacted_thinking',
	thinkingSignature: string | undefined,
	toolInput: IDataObject,
	toolId: string,
	toolName: string,
): Array<ThinkingContentBlock | RedactedThinkingContentBlock | ToolUseContentBlock> {
	// The two block kinds use different field names in the Anthropic API.
	let reasoningBlock: ThinkingContentBlock | RedactedThinkingContentBlock;
	if (thinkingType === 'thinking') {
		reasoningBlock = {
			type: 'thinking',
			thinking: thinkingContent,
			signature: thinkingSignature ?? '', // Keep the original signature when we have one
		};
	} else {
		reasoningBlock = {
			type: 'redacted_thinking',
			data: thinkingContent,
		};
	}

	// tool_use takes the nested `input` payload; anything that is not a
	// non-null object is replaced by an empty argument object.
	const rawArgs = toolInput.input;
	const hasObjectArgs = typeof rawArgs === 'object' && rawArgs !== null;
	const toolUseBlock: ToolUseContentBlock = {
		type: 'tool_use',
		id: toolId,
		name: toolName,
		input: hasObjectArgs ? (rawArgs as Record<string, unknown>) : {},
	};

	return [reasoningBlock, toolUseBlock];
}
/**
 * Produces the `content` of the AI message that represents a tool call.
 *
 * When both Anthropic thinking content and a thinking type are present, the
 * content is an array of blocks (thinking + tool_use). In every other case it
 * is a plain descriptive string.
 *
 * @param providerMetadata - Provider-specific metadata
 * @param toolInput - The tool input data
 * @param toolId - The tool call ID
 * @param toolName - The tool name
 * @returns Message content (string or content blocks array)
 */
function buildMessageContent(
	providerMetadata: ProviderMetadata,
	toolInput: IDataObject,
	toolId: string,
	toolName: string,
): string | Array<ThinkingContentBlock | RedactedThinkingContentBlock | ToolUseContentBlock> {
	const { thinkingContent, thinkingType, thinkingSignature } = providerMetadata;

	// Default: no usable thinking data, so fall back to simple string content.
	if (!thinkingContent || !thinkingType) {
		return `Calling ${toolName} with input: ${JSON.stringify(toolInput)}`;
	}

	// Anthropic thinking mode: structured content blocks.
	return buildAnthropicContentBlocks(
		thinkingContent,
		thinkingType,
		thinkingSignature,
		toolInput,
		toolId,
		toolName,
	);
}
/**
 * Picks the tool name to record for a tool result, in order of preference:
 * the HITL tool name from the action metadata, the string stored in
 * `input.tool`, and finally the name derived from the node name.
 *
 * For toolkit tools (e.g., MCP Client), the actual tool name is stored in input.tool
 * by createEngineRequests. Using the node name (e.g., "MCP_Client") instead would
 * cause the LLM to see a wrong tool name in history and attempt to re-call it,
 * resulting in createEngineRequests returning an empty actions array and an infinite loop.
 *
 * @param tool - The tool result whose action is inspected
 * @returns The resolved tool name
 */
function resolveToolName(tool: EngineResult<RequestResponseMetadata>): string {
	const { action } = tool;

	const hitlToolName = action.metadata?.hitl?.toolName;
	if (hitlToolName !== undefined && hitlToolName !== null) {
		return hitlToolName;
	}

	const toolkitToolName = action.input?.tool;
	if (typeof toolkitToolName === 'string') {
		return toolkitToolName;
	}

	return nodeNameToToolName(action.nodeName);
}
/**
 * Processed tool response data used during step building.
 *
 * `buildSteps` creates one of these per accepted action response in its first
 * pass and turns each into a step in its second pass.
 */
interface ProcessedToolResponse {
/** The action response entry this record was built from */
tool: NonNullable<EngineResponse<RequestResponseMetadata>['actionResponses']>[number];
/** The action's input with `id` set to the action id */
toolInput: IDataObject;
/** `toolInput.id` when it is a string, otherwise the placeholder 'reconstructed_call' */
toolId: string;
/** Tool name as returned by `resolveToolName` */
toolName: string;
/** Name of the node the action ran on (`tool.action.nodeName`) */
nodeName: string;
/** Provider metadata extracted from the action's metadata */
providerMetadata: ProviderMetadata;
}
/**
 * Builds the additional_kwargs needed for Gemini thought signatures.
 *
 * The structure matches what @langchain/google-common expects:
 * - `__gemini_function_call_thought_signatures__`: maps the first tool call ID to the signature
 * - `tool_calls`: array of tool call descriptors (only `id`, `name` and `args` are copied)
 * - `signatures`: array aligned to parts [textPart, functionCall_1, ...], with the signature
 *   only on the first function call (per Google's docs)
 *
 * An empty `toolCalls` array is tolerated: there is no function call to attach the
 * signature to, so the signature map is empty and `signatures` holds only the
 * text-part slot.
 *
 * @param toolCalls - Tool calls to include
 * @param thoughtSignature - The Gemini thought signature
 * @returns additional_kwargs object for AIMessage
 */
function buildGeminiAdditionalKwargs(
	toolCalls: Array<{ id: string; name: string; args: IDataObject }>,
	thoughtSignature: string,
): Record<string, unknown> {
	// Slot 0 is the text part and never carries a signature; after that there is
	// one slot per function call, with only the first one signed.
	const signatures: string[] = [
		'',
		...toolCalls.map((_call, index) => (index === 0 ? thoughtSignature : '')),
	];

	// Guarded access instead of `toolCalls[0].id`, which throws on an empty array.
	const [firstCall] = toolCalls;

	return {
		__gemini_function_call_thought_signatures__: firstCall
			? { [firstCall.id]: thoughtSignature }
			: {},
		tool_calls: toolCalls.map(({ id, name, args }) => ({ id, name, args })),
		signatures,
	};
}
/**
 * Creates the AIMessage that stands for one tool call, in the format the
 * originating provider expects.
 *
 * For Anthropic thinking mode, content is an array of blocks (thinking + tool_use).
 * For Gemini with thought signatures, additional_kwargs carries the signature.
 * For other providers, content is a simple string with tool_calls set.
 *
 * @param toolId - The tool call ID
 * @param toolName - The tool name
 * @param toolInput - The tool input data, used as the tool call arguments
 * @param providerMetadata - Provider-specific metadata for this tool call
 * @returns The AIMessage describing the tool call
 */
function buildIndividualAIMessage(
	toolId: string,
	toolName: string,
	toolInput: IDataObject,
	providerMetadata: ProviderMetadata,
): AIMessage {
	const content = buildMessageContent(providerMetadata, toolInput, toolId, toolName);
	const { thoughtSignature } = providerMetadata;

	// When content is an array (Anthropic thinking), LangChain ignores tool_calls,
	// so the field is only supplied alongside string content.
	const toolCallFields =
		typeof content === 'string'
			? {
					tool_calls: [
						{ id: toolId, name: toolName, args: toolInput, type: 'tool_call' as const },
					],
				}
			: {};

	// Gemini: attach the thought signature only when one was captured.
	const geminiFields = thoughtSignature
		? {
				additional_kwargs: buildGeminiAdditionalKwargs(
					[{ id: toolId, name: toolName, args: toolInput }],
					thoughtSignature,
				),
			}
		: {};

	return new AIMessage({
		content,
		...toolCallFields,
		...geminiFields,
	});
}
/**
 * Creates the single AIMessage shared by a batch of parallel tool calls that
 * carry a Gemini thought signature.
 *
 * For parallel function calls, Gemini requires ALL function calls in a single "model" turn
 * with the thought_signature only on the first function call part. This matches the original
 * model response structure and ensures the cryptographic signature validates correctly.
 *
 * LangChain's formatToToolMessages creates one ToolMessage per step. The @langchain/google-common
 * package automatically merges consecutive "function" role messages, so the function responses
 * will be correctly grouped.
 *
 * @param processedTools - Array of processed tool responses to include
 * @param thoughtSignature - The shared thought signature from Gemini
 * @returns AIMessage with all tool calls and proper signature format
 */
function buildSharedGeminiAIMessage(
	processedTools: ProcessedToolResponse[],
	thoughtSignature: string,
): AIMessage {
	const toolCalls: Array<{ id: string; name: string; args: IDataObject; type: 'tool_call' }> =
		[];
	// The human-readable content lists node names, not resolved tool names.
	const nodeNames: string[] = [];

	for (const { toolId, toolName, toolInput, nodeName } of processedTools) {
		toolCalls.push({ id: toolId, name: toolName, args: toolInput, type: 'tool_call' });
		nodeNames.push(nodeName);
	}

	return new AIMessage({
		content: `Calling tools: ${nodeNames.join(', ')}`,
		tool_calls: toolCalls,
		additional_kwargs: buildGeminiAdditionalKwargs(toolCalls, thoughtSignature),
	});
}
/**
 * Serializes a tool result into the observation string stored on a step.
 *
 * Precedence:
 * 1. the `json` payloads of the first `ai_tool` output branch, when that branch is non-empty;
 * 2. otherwise the error, as `{ error, errorType? }`;
 * 3. otherwise a JSON-encoded empty string.
 *
 * @param toolData - The tool result data, optionally holding output items and/or an error
 * @returns A JSON string describing the tool outcome
 */
function buildObservation(toolData: {
	data?: { ai_tool?: Array<Array<{ json?: unknown }>> };
	error?: { message?: string; name?: string };
}): string {
	const payloads = toolData?.data?.ai_tool?.[0]?.map((entry) => entry?.json) ?? [];
	if (payloads.length > 0) {
		return JSON.stringify(payloads);
	}

	const failure = toolData?.error;
	if (!failure) {
		return JSON.stringify('');
	}

	const summary: { error: string; errorType?: string } = {
		error: failure.message ?? 'Unknown error',
	};
	// The error type is reported only when the error has a non-empty name.
	if (failure.name) {
		summary.errorType = failure.name;
	}
	return JSON.stringify(summary);
}
/**
 * Reconstructs the agent steps from earlier tool call responses so that agent
 * execution can continue after the tools have run.
 *
 * Steps found in `response.metadata.previousRequests` are kept first. Each action
 * response is then added as a new step if it belongs to `itemIndex`, has data,
 * and its tool call id is not already among those carried-over steps.
 *
 * For parallel tool calls with Gemini thought signatures, all function calls are grouped
 * into a single AIMessage to match the original model response structure. This is required
 * because the thought_signature is cryptographically tied to the combined parallel call turn.
 *
 * This is a generalized version that can be used across different agent types
 * (Tools Agent, OpenAI Functions Agent, etc.).
 *
 * @param response - The engine response containing tool call results
 * @param itemIndex - The current item index being processed
 * @returns Array of tool call data representing the agent steps
 */
export function buildSteps(
	response: EngineResponse<RequestResponseMetadata> | undefined,
	itemIndex: number,
): ToolCallData[] {
	const steps: ToolCallData[] = [];
	if (!response) {
		return steps;
	}

	// Steps carried over from previous requests come first.
	const carriedOver = response.metadata?.previousRequests;
	if (carriedOver) {
		steps.push(...carriedOver);
	}

	// Pass 1: collect the valid tool responses of this batch.
	const batchTools: ProcessedToolResponse[] = [];
	for (const tool of response.actionResponses ?? []) {
		if (tool.action?.metadata?.itemIndex !== itemIndex) continue;
		if (!tool.data) continue;

		const toolInput: IDataObject = {
			...tool.action.input,
			id: tool.action.id,
		};

		// Skip responses already represented among the carried-over steps.
		const alreadyRecorded = steps.some((step) => step.action.toolCallId === toolInput.id);
		if (alreadyRecorded) continue;

		const providerMetadata = extractProviderMetadata(tool.action.metadata);
		const toolId = typeof toolInput.id === 'string' ? toolInput.id : 'reconstructed_call';
		const toolName = resolveToolName(tool);

		batchTools.push({
			tool,
			toolInput,
			toolId,
			toolName,
			nodeName: tool.action.nodeName,
			providerMetadata,
		});
	}

	// A batch with a Gemini thought signature and several parallel tool calls must be
	// grouped into a single AIMessage because:
	// 1. The thought_signature is cryptographically tied to the combined parallel call turn
	// 2. Splitting into separate model turns invalidates the signature
	// 3. Google's API requires matching function response parts per function call turn
	let sharedThoughtSignature: string | undefined;
	for (const { providerMetadata } of batchTools) {
		if (providerMetadata.thoughtSignature) {
			sharedThoughtSignature = providerMetadata.thoughtSignature;
			break;
		}
	}

	let sharedAIMessage: AIMessage | undefined;
	if (sharedThoughtSignature && batchTools.length > 1) {
		sharedAIMessage = buildSharedGeminiAIMessage(batchTools, sharedThoughtSignature);
	}

	// Pass 2: turn every collected response into a step.
	for (const [position, entry] of batchTools.entries()) {
		const { tool, toolInput, toolId, toolName, nodeName, providerMetadata } = entry;
		const observation = buildObservation(tool.data);

		// Metadata fields (id, log, type) are not forwarded as tool input on the result.
		const { id, log, type, ...toolInputForResult } = toolInput;

		// Parallel Gemini tool calls: the first step carries the shared AIMessage and the
		// following steps get an empty messageLog. LangChain's formatToToolMessages
		// will produce: [SharedAIMessage, ToolMsg_1, ToolMsg_2, ...]
		let messageLog: AIMessage[];
		if (!sharedAIMessage) {
			messageLog = [buildIndividualAIMessage(toolId, toolName, toolInput, providerMetadata)];
		} else if (position === 0) {
			messageLog = [sharedAIMessage];
		} else {
			messageLog = [];
		}

		steps.push({
			action: {
				tool: toolName,
				toolInput: toolInputForResult,
				log: log || (messageLog[0]?.content ?? `Calling ${nodeName}`),
				messageLog,
				toolCallId: id,
				type: type || 'tool_call',
			},
			observation,
		});
	}

	return steps;
}