diff --git a/packages/@n8n/agents/src/__tests__/agent-runtime.test.ts b/packages/@n8n/agents/src/__tests__/agent-runtime.test.ts index 396f408ceaa..51e73e74c71 100644 --- a/packages/@n8n/agents/src/__tests__/agent-runtime.test.ts +++ b/packages/@n8n/agents/src/__tests__/agent-runtime.test.ts @@ -2073,6 +2073,39 @@ describe('AgentRuntime — telemetry propagation', () => { expect(span.end).toHaveBeenCalledTimes(1); }); + it('can suppress the generic runtime root span while keeping native telemetry enabled', async () => { + generateText.mockResolvedValue(makeGenerateSuccess()); + const tracer = { + startActiveSpan: jest.fn(), + }; + const telemetry: BuiltTelemetry = { + ...baseTelemetry, + runtimeRootSpanEnabled: false, + tracer, + }; + + const runtime = new AgentRuntime({ + name: 'telemetry-root-test', + model: 'openai/gpt-4o-mini', + instructions: 'test', + eventBus: new AgentEventBus(), + telemetry, + }); + + await runtime.generate('hello'); + + expect(tracer.startActiveSpan).not.toHaveBeenCalled(); + // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access + const callArgs = generateText.mock.calls[0][0] as Record; + expect(callArgs.experimental_telemetry).toEqual( + expect.objectContaining({ + isEnabled: true, + functionId: 'test-agent', + tracer, + }), + ); + }); + it('adds a LangSmith tool catalog to telemetry root spans', async () => { generateText.mockResolvedValue(makeGenerateSuccess()); const span = { diff --git a/packages/@n8n/agents/src/__tests__/telemetry.test.ts b/packages/@n8n/agents/src/__tests__/telemetry.test.ts index b1af13c4e95..c903e82b5a5 100644 --- a/packages/@n8n/agents/src/__tests__/telemetry.test.ts +++ b/packages/@n8n/agents/src/__tests__/telemetry.test.ts @@ -8,6 +8,7 @@ describe('Telemetry builder', () => { expect(built.enabled).toBe(true); expect(built.recordInputs).toBe(true); expect(built.recordOutputs).toBe(true); + expect(built.runtimeRootSpanEnabled).toBe(true); expect(built.functionId).toBeUndefined(); 
expect(built.metadata).toBeUndefined(); expect(built.integrations).toEqual([]); @@ -22,6 +23,7 @@ describe('Telemetry builder', () => { .metadata({ team: 'platform', version: 2 }) .recordInputs(false) .recordOutputs(false) + .runtimeRootSpan(false) .build(); expect(built.enabled).toBe(false); @@ -29,6 +31,7 @@ describe('Telemetry builder', () => { expect(built.metadata).toEqual({ team: 'platform', version: 2 }); expect(built.recordInputs).toBe(false); expect(built.recordOutputs).toBe(false); + expect(built.runtimeRootSpanEnabled).toBe(false); }); it('accepts a pre-built tracer', async () => { diff --git a/packages/@n8n/agents/src/runtime/agent-runtime.ts b/packages/@n8n/agents/src/runtime/agent-runtime.ts index a56f6464714..189c4e31494 100644 --- a/packages/@n8n/agents/src/runtime/agent-runtime.ts +++ b/packages/@n8n/agents/src/runtime/agent-runtime.ts @@ -745,7 +745,7 @@ export class AgentRuntime { fn: () => Promise, ): Promise { const t = this.resolveTelemetry(); - if (!t?.enabled || !isActiveSpanTracer(t.tracer)) { + if (!t?.enabled || t.runtimeRootSpanEnabled === false || !isActiveSpanTracer(t.tracer)) { return await fn(); } diff --git a/packages/@n8n/agents/src/sdk/telemetry.ts b/packages/@n8n/agents/src/sdk/telemetry.ts index e3219084733..ea475121a1a 100644 --- a/packages/@n8n/agents/src/sdk/telemetry.ts +++ b/packages/@n8n/agents/src/sdk/telemetry.ts @@ -153,6 +153,8 @@ export class Telemetry { protected recordOutputsValue = true; + protected runtimeRootSpanEnabledValue = true; + protected redactFn?: RedactFn; protected integrationsList: TelemetryIntegration[] = []; @@ -223,6 +225,12 @@ export class Telemetry { return this; } + /** Enable or disable the generic AgentRuntime root span around generate/stream loops. */ + runtimeRootSpan(value: boolean): this { + this.runtimeRootSpanEnabledValue = value; + return this; + } + /** * Set a redaction callback. 
When set, all integration hooks will * have their event data passed through this function before the @@ -287,6 +295,7 @@ export class Telemetry { metadata: this.metadataValue, recordInputs: this.recordInputsValue, recordOutputs: this.recordOutputsValue, + runtimeRootSpanEnabled: this.runtimeRootSpanEnabledValue, integrations, tracer, provider, diff --git a/packages/@n8n/agents/src/types/telemetry.ts b/packages/@n8n/agents/src/types/telemetry.ts index 64dc089b004..6788c19bf38 100644 --- a/packages/@n8n/agents/src/types/telemetry.ts +++ b/packages/@n8n/agents/src/types/telemetry.ts @@ -26,6 +26,8 @@ export interface BuiltTelemetry { readonly metadata?: Record; readonly recordInputs: boolean; readonly recordOutputs: boolean; + /** Whether AgentRuntime should add a generic chain span around generate/stream loops. */ + readonly runtimeRootSpanEnabled?: boolean; /** Integrations are pre-wrapped with redaction if .redact() was set at build time. */ readonly integrations: TelemetryIntegration[]; readonly tracer?: OpaqueTracer; diff --git a/packages/@n8n/instance-ai/TRACING_SPECS.md b/packages/@n8n/instance-ai/TRACING_SPECS.md index 5e9300641dc..2d58bfefe9d 100644 --- a/packages/@n8n/instance-ai/TRACING_SPECS.md +++ b/packages/@n8n/instance-ai/TRACING_SPECS.md @@ -82,16 +82,22 @@ Implemented so far: - Normal foreground and detached trace creation no longer creates RunTree spans. - Agent tree snapshots persist OTel trace/span IDs alongside derived LangSmith IDs for feedback anchoring. - -Still wrong: - +- Instance AI disables the generic `@n8n/agents` runtime root span because the + product actor span already represents the agent loop; native provider and + `ai.toolCall` spans remain enabled and are parented directly under the + product actor span. - Live LangSmith validation has proved feedback against an OTel-only product - root; full provider-span validation with a real model turn is still pending. 
-- Some fallback RunTree compatibility code remains for legacy/replay-only - paths and should be deleted after rollout validation. + root and full provider-span visibility with a real model turn. - Detached sub-agent linking captures spawning trace/span metadata and model tool-call IDs when a detached task is spawned from a local tool handler. +Remaining follow-up: + +- Some fallback RunTree compatibility code remains for legacy/manual stream + trace debugging only. It is disabled by default behind + `N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING=true` and should be removed in the + post-rollout cleanup once no legacy stream-hook consumers remain. + ## Hybrid Reference Notes The last working hybrid traces showed RunTree product nodes such as @@ -107,6 +113,31 @@ longer shows the complete system/user/tool/provider turn under a single OTel context. Regression coverage now asserts normal Instance AI trace creation does not create RunTree spans. +## Live Validation Notes + +Live validation with explicit credentials has covered two cases: + +- `instance-ai-tracing-validation` thread + `otel-validation-f97e5f00-589a-49fb-a536-d54d417c30eb` proved foreground + and detached roots are queryable by the same thread ID. The thread contained + `instance-ai.message_turn`, native `ai.generateText.doGenerate` spans with + tool definitions and token usage, a local tool span, and detached + `instance-ai.subagent.workflow-builder` metadata with spawning trace/span + IDs. +- `instance-ai-tracing-validation` thread + `otel-runtime-root-validation-4c6d9b3c-ae3f-454a-bf97-d984be36a2be` proved + the generic `@n8n/agents` runtime root span can be suppressed for Instance + AI. The resulting foreground tree was + `instance-ai.message_turn -> instance-ai.orchestrator.stream -> + ai.generateText.doGenerate/add_numbers`, with no duplicate + `instance-ai.orchestrator.stream` wrapper. 
+ +User-provided fresh run `81b3a657-c452-484f-ac3c-122836016094` confirmed the +pre-suppression implementation had correct native provider/tool visibility, +token usage, detached sub-agent roots, and spawning metadata, but still showed +duplicate same-named agent wrapper spans. The runtime-root suppression is the +follow-up fix for that shape issue. + ## Target Architecture ```mermaid @@ -330,6 +361,13 @@ The noisy AI SDK wrapper spans such as `ai.streamText` may be filtered from LangSmith export as long as provider request spans, tool spans, and product root spans remain correctly parented. +For Instance AI, the generic `@n8n/agents` runtime root span around +generate/stream loops is disabled. Generic agents may still use that span, but +Instance AI already has explicit product actor spans such as +`instance-ai.orchestrator.stream` and `instance-ai.subagent.<role>.stream`. +Disabling the generic wrapper avoids duplicate same-named agent spans while +preserving native provider and tool telemetry. + ## Span Kinds and Inputs Use LangSmith-compatible span attributes: @@ -462,7 +500,7 @@ must not require LangSmith to be available. - LangSmith OTLP tracer/provider construction - LangSmith OTel span filtering - mapping telemetry into AI SDK `experimental_telemetry` -- runtime root spans around generate/stream loops +- optional runtime root spans around generate/stream loops - AI-SDK-compatible local `ai.toolCall` spans - provider flush and shutdown hooks - generic telemetry integration hooks @@ -476,6 +514,8 @@ must not require LangSmith to be available. - feedback snapshot persistence - service proxy request metadata and headers - detached sub-agent linking metadata +- disabling generic runtime root spans when product actor spans are already + present - trace replay events `@n8n/ai-utilities` may own: @@ -526,7 +566,11 @@ must not require LangSmith to be available. outside the foreground context. 
- [x] Add spawning metadata: trace ID, span ID, tool call ID, task ID, and agent role. - - [ ] Confirm thread queries show detached roots alongside foreground turns. + - [x] Confirm thread queries show detached roots alongside foreground turns. + Live validation in `instance-ai-tracing-validation` for thread + `otel-validation-f97e5f00-589a-49fb-a536-d54d417c30eb` returned 9 runs + across 2 traces, including `instance-ai.message_turn` and + `instance-ai.subagent.workflow-builder`. 6. Rework feedback anchoring @@ -539,15 +583,21 @@ must not require LangSmith to be available. - [x] Remove normal-path `RunTree` root creation. - [x] Remove normal-path manual RunTree tool wrappers. - - [ ] Keep only temporary compatibility code behind an explicit flag, if + - [x] Keep only temporary compatibility code behind an explicit flag, if needed for rollout. - - [ ] Delete compatibility code after validation. + The flag is `N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING=true`; it is disabled + by default. + - [x] ~~Delete compatibility code after validation.~~ Deferred to + post-rollout cleanup after legacy manual stream-hook consumers are proven + unused. 8. Decouple replay from tracing - [x] Ensure replay records stable Instance AI events, not span IDs. - [x] Ensure replay tests pass with LangSmith disabled. - - [ ] Optionally emit replay-tagged OTel spans for debugging only. + - [x] ~~Optionally emit replay-tagged OTel spans for debugging only.~~ Not + implemented; replay remains LangSmith-independent and does not emit debug + traces by default. 9. Add regression coverage @@ -555,9 +605,12 @@ must not require LangSmith to be available. - [x] Unit test OTel product span parentage. - [x] Unit test feedback ID persistence. - [x] Unit test redaction preserving token usage. - - [ ] Local exporter test proving one foreground message turn contains + - [x] Local exporter test proving one foreground message turn contains product spans, native provider spans, and local tool spans. 
- - [ ] Live LangSmith validation behind explicit credentials. + - [x] Live LangSmith validation behind explicit credentials. + The validation showed native `ai.generateText.doGenerate` spans under the + foreground product trace with system/user messages, tool definitions, + tool choice, token usage, and a local tool span. ## Acceptance Criteria diff --git a/packages/@n8n/instance-ai/src/runtime/__tests__/resumable-stream-executor.test.ts b/packages/@n8n/instance-ai/src/runtime/__tests__/resumable-stream-executor.test.ts index 8e668982ba0..d861f8fa6d2 100644 --- a/packages/@n8n/instance-ai/src/runtime/__tests__/resumable-stream-executor.test.ts +++ b/packages/@n8n/instance-ai/src/runtime/__tests__/resumable-stream-executor.test.ts @@ -317,8 +317,19 @@ interface PublishedEvent { } describe('executeResumableStream', () => { + const originalLegacyRunTreeTracing = process.env.N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING; + beforeEach(() => { langsmithMock.reset(); + process.env.N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING = 'true'; + }); + + afterAll(() => { + if (originalLegacyRunTreeTracing === undefined) { + delete process.env.N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING; + } else { + process.env.N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING = originalLegacyRunTreeTracing; + } }); it('buffers the confirmation event in manual mode', async () => { diff --git a/packages/@n8n/instance-ai/src/tracing/__tests__/langsmith-tracing.test.ts b/packages/@n8n/instance-ai/src/tracing/__tests__/langsmith-tracing.test.ts index 9f3a7ea0c5d..beb928e7ad0 100644 --- a/packages/@n8n/instance-ai/src/tracing/__tests__/langsmith-tracing.test.ts +++ b/packages/@n8n/instance-ai/src/tracing/__tests__/langsmith-tracing.test.ts @@ -1,14 +1,23 @@ -import type { Context } from '@opentelemetry/api'; +import type { Context, ContextManager } from '@opentelemetry/api'; import { jsonParse } from 'n8n-workflow'; +import type * as AsyncHooks from 'node:async_hooks'; import { executeTool } from '../../__tests__/tool-test-utils'; 
jest.mock('@n8n/agents', () => { const actual = jest.requireActual>('@n8n/agents'); - const { context, trace } = jest.requireActual<{ + const { AsyncLocalStorage } = jest.requireActual('node:async_hooks'); + const { ROOT_CONTEXT, context, trace } = jest.requireActual<{ + ROOT_CONTEXT: Context; context: { active(): Context; - with(ctx: Context, fn: () => T): T; + with( + ctx: Context, + fn: (...args: unknown[]) => T, + thisArg?: unknown, + ...args: unknown[] + ): T; + setGlobalContextManager(contextManager: ContextManager): boolean; }; trace: { getSpan(ctx: Context): unknown; @@ -17,6 +26,16 @@ jest.mock('@n8n/agents', () => { }>('@opentelemetry/api'); let spanCounter = 0; + const contextStorage = new AsyncLocalStorage(); + const contextManager: ContextManager = { + active: () => contextStorage.getStore() ?? ROOT_CONTEXT, + with: (ctx, fn, thisArg, ...args) => contextStorage.run(ctx, () => fn.call(thisArg, ...args)), + bind: (_ctx, target) => target, + enable: () => contextManager, + disable: () => contextManager, + }; + context.setGlobalContextManager(contextManager); + const spans: Array<{ id: string; traceId: string; @@ -100,6 +119,7 @@ jest.mock('@n8n/agents', () => { private metadataValue?: Record; private recordInputsValue = true; private recordOutputsValue = true; + private runtimeRootSpanEnabledValue = true; functionId(value: string): this { this.functionIdValue = value; @@ -121,6 +141,11 @@ jest.mock('@n8n/agents', () => { return this; } + runtimeRootSpan(value: boolean): this { + this.runtimeRootSpanEnabledValue = value; + return this; + } + async build(): Promise> { return await Promise.resolve({ enabled: true, @@ -128,6 +153,7 @@ jest.mock('@n8n/agents', () => { metadata: this.metadataValue, recordInputs: this.recordInputsValue, recordOutputs: this.recordOutputsValue, + runtimeRootSpanEnabled: this.runtimeRootSpanEnabledValue, integrations: [], tracer, provider, @@ -512,6 +538,7 @@ describe('createInstanceAiTraceContext', () => { 
expect(telemetry.functionId).toBe('instance-ai.orchestrator'); expect(telemetry.recordInputs).toBe(true); expect(telemetry.recordOutputs).toBe(true); + expect(telemetry.runtimeRootSpanEnabled).toBe(false); expect(telemetry.metadata).toEqual( expect.objectContaining({ thread_id: 'thread-1', @@ -726,6 +753,16 @@ describe('createInstanceAiTraceContext', () => { spawned_by_tool_call_id: 'toolu-1', }), ); + + const telemetryOrBuilder = tracing!.getTelemetry!({ + agentRole: 'workflow-builder', + functionId: 'instance-ai.subagent.workflow-builder', + executionMode: 'detached_subagent', + }); + const telemetry = + 'build' in telemetryOrBuilder ? await telemetryOrBuilder.build() : telemetryOrBuilder; + + expect(telemetry.runtimeRootSpanEnabled).toBe(false); }); it('attaches root agent config without duplicating it into llm steps', async () => { @@ -1145,6 +1182,91 @@ describe('createInstanceAiTraceContext', () => { expect(langsmithMock.getCreatedRunTrees()).toHaveLength(0); }); + it('keeps product, native provider, and local tool spans in one foreground OTel trace', async () => { + const tracing = await createInstanceAiTraceContext({ + threadId: 'thread-local-exporter', + messageId: 'message-local-exporter', + runId: 'run-local-exporter', + userId: 'user-local-exporter', + input: { message: 'Build a workflow' }, + }); + + expect(tracing).toBeDefined(); + + const wrappedTools = tracing!.wrapTools( + { + workspace_write_file: { + name: 'workspace_write_file', + description: 'Write a file in the workspace.', + handler: jest.fn(async () => await Promise.resolve({ written: true })), + } as never, + }, + { agentRole: 'workflow-builder' }, + ); + const workspaceWriteFile = wrappedTools.workspace_write_file; + if (!isExecutableTool(workspaceWriteFile)) { + throw new Error('Wrapped workspace_write_file tool is not executable'); + } + + await tracing!.withRunTree(tracing!.orchestratorRun, async () => { + const telemetryOrBuilder = tracing!.getTelemetry!({ + agentRole: 'orchestrator', 
+ functionId: 'instance-ai.orchestrator', + }); + if ('build' in telemetryOrBuilder) { + throw new Error('Expected foreground tracing to reuse built OTel telemetry'); + } + + type NativeSpan = { + end(): void; + spanContext(): { traceId: string; spanId: string }; + }; + type NativeTracer = { + startSpan(name: string, options?: { attributes?: Record }): NativeSpan; + }; + + const providerSpan = (telemetryOrBuilder.tracer as NativeTracer).startSpan( + 'ai.streamText.doStream', + { + attributes: { + 'ai.operationId': 'ai.streamText.doStream', + 'langsmith.span.kind': 'llm', + }, + }, + ); + providerSpan.end(); + + await workspaceWriteFile.handler( + { path: 'workflow.json', content: '{}' }, + { toolCallId: 'toolu-write-file' }, + ); + }); + + await tracing!.finishRun(tracing!.orchestratorRun, { outputs: { status: 'done' } }); + await tracing!.finishRun(tracing!.rootRun, { outputs: { status: 'done' } }); + + const spans = agentsMock.getSpans(); + const rootSpan = spans.find((span) => span.name === 'instance-ai.message_turn'); + const orchestratorSpan = spans.find((span) => span.name === 'instance-ai.orchestrator.stream'); + const providerSpan = spans.find((span) => span.name === 'ai.streamText.doStream'); + const localToolSpan = spans.find((span) => span.name === 'instance-ai.tool.workspace_edit'); + + expect(rootSpan).toBeDefined(); + expect(orchestratorSpan).toBeDefined(); + expect(providerSpan).toBeDefined(); + expect(localToolSpan).toBeDefined(); + expect( + new Set( + [rootSpan, orchestratorSpan, providerSpan, localToolSpan].map((span) => span?.traceId), + ), + ).toEqual(new Set([rootSpan?.traceId])); + expect(orchestratorSpan?.parentSpanId).toBe(rootSpan?.id); + expect(providerSpan?.parentSpanId).toBe(orchestratorSpan?.id); + expect(localToolSpan?.parentSpanId).toBe(orchestratorSpan?.id); + expect(localToolSpan?.attributes.tool_call_id).toBe('toolu-write-file'); + expect(langsmithMock.getCreatedRunTrees()).toHaveLength(0); + }); + it('returns undefined when 
tracing is explicitly disabled even with proxy', async () => { process.env.LANGCHAIN_TRACING_V2 = 'false'; diff --git a/packages/@n8n/instance-ai/src/tracing/langsmith-tracing.ts b/packages/@n8n/instance-ai/src/tracing/langsmith-tracing.ts index d9114086a12..78530dbf746 100644 --- a/packages/@n8n/instance-ai/src/tracing/langsmith-tracing.ts +++ b/packages/@n8n/instance-ai/src/tracing/langsmith-tracing.ts @@ -102,6 +102,7 @@ const LANGSMITH_SPAN_KIND = 'langsmith.span.kind'; const LANGSMITH_SPAN_TAGS = 'langsmith.span.tags'; const GEN_AI_PROMPT = 'gen_ai.prompt'; const GEN_AI_COMPLETION = 'gen_ai.completion'; +const LEGACY_RUNTREE_TRACING_ENV = 'N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING'; interface ProductOtelTraceRuntime { telemetry: BuiltTelemetry; @@ -485,6 +486,10 @@ function ensureLangSmithTracingEnv(): void { process.env.LANGSMITH_TRACING ??= 'true'; } +function isLegacyRunTreeTracingEnabled(): boolean { + return process.env[LEGACY_RUNTREE_TRACING_ENV]?.toLowerCase() === 'true'; +} + function normalizeErrorMessage(error: unknown): string { return error instanceof Error ? error.message : String(error); } @@ -997,6 +1002,10 @@ export async function submitLangsmithUserFeedback( } export function getTraceParentRun(): RunTree | undefined { + if (!isLegacyRunTreeTracingEnabled()) { + return undefined; + } + const overrideRun = traceParentOverrideStorage.getStore()?.current; if (overrideRun) { return overrideRun; @@ -1010,6 +1019,10 @@ export function getTraceParentRun(): RunTree | undefined { } export function setTraceParentOverride(parentRun: RunTree | null | undefined): void { + if (!isLegacyRunTreeTracingEnabled()) { + return; + } + const store = traceParentOverrideStorage.getStore(); if (store) { store.current = parentRun ?? 
null; @@ -1105,6 +1118,10 @@ export async function withTraceParentContext( parentRun: RunTree | undefined, fn: () => Promise, ): Promise { + if (!isLegacyRunTreeTracingEnabled()) { + return await fn(); + } + // Always create a new nested ALS context. Mutating an existing store.current // is not safe when concurrent background tasks inherit the same parent context. return await traceParentOverrideStorage.run({ current: parentRun ?? null }, fn); @@ -2044,6 +2061,7 @@ function createTelemetryFactory(options: { metadata, recordInputs: true, recordOutputs: true, + runtimeRootSpanEnabled: false, }; } @@ -2051,7 +2069,8 @@ function createTelemetryFactory(options: { .functionId(functionId) .metadata(metadata) .recordInputs(true) - .recordOutputs(true); + .recordOutputs(true) + .runtimeRootSpan(false); }; }