fix(instance-ai): suppress duplicate runtime trace spans

2026-05-31 08:46:58 +02:00 · 2026-05-05 17:32:52 +02:00 · 2026-05-05 17:32:52 +02:00 · 23fa642e26
commit 23fa642e26
parent e5e0cb97cd
9 changed files with 270 additions and 18 deletions
--- a/packages/@n8n/agents/src/tests/agent-runtime.test.ts
+++ b/packages/@n8n/agents/src/tests/agent-runtime.test.ts
@ -2073,6 +2073,39 @@ describe('AgentRuntime — telemetry propagation', () => {
 		expect(span.end).toHaveBeenCalledTimes(1);
 	});

+	it('can suppress the generic runtime root span while keeping native telemetry enabled', async () => {
+		generateText.mockResolvedValue(makeGenerateSuccess());
+		const tracer = {
+			startActiveSpan: jest.fn(),
+		};
+		const telemetry: BuiltTelemetry = {
+			...baseTelemetry,
+			runtimeRootSpanEnabled: false,
+			tracer,
+		};
+
+		const runtime = new AgentRuntime({
+			name: 'telemetry-root-test',
+			model: 'openai/gpt-4o-mini',
+			instructions: 'test',
+			eventBus: new AgentEventBus(),
+			telemetry,
+		});
+
+		await runtime.generate('hello');
+
+		expect(tracer.startActiveSpan).not.toHaveBeenCalled();
+		// eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
+		const callArgs = generateText.mock.calls[0][0] as Record<string, unknown>;
+		expect(callArgs.experimental_telemetry).toEqual(
+			expect.objectContaining({
+				isEnabled: true,
+				functionId: 'test-agent',
+				tracer,
+			}),
+		);
+	});
+
 	it('adds a LangSmith tool catalog to telemetry root spans', async () => {
 		generateText.mockResolvedValue(makeGenerateSuccess());
 		const span = {
--- a/packages/@n8n/agents/src/tests/telemetry.test.ts
+++ b/packages/@n8n/agents/src/tests/telemetry.test.ts
@ -8,6 +8,7 @@ describe('Telemetry builder', () => {
 		expect(built.enabled).toBe(true);
 		expect(built.recordInputs).toBe(true);
 		expect(built.recordOutputs).toBe(true);
+		expect(built.runtimeRootSpanEnabled).toBe(true);
 		expect(built.functionId).toBeUndefined();
 		expect(built.metadata).toBeUndefined();
 		expect(built.integrations).toEqual([]);
@ -22,6 +23,7 @@ describe('Telemetry builder', () => {
 			.metadata({ team: 'platform', version: 2 })
 			.recordInputs(false)
 			.recordOutputs(false)
+			.runtimeRootSpan(false)
 			.build();

 		expect(built.enabled).toBe(false);
@ -29,6 +31,7 @@ describe('Telemetry builder', () => {
 		expect(built.metadata).toEqual({ team: 'platform', version: 2 });
 		expect(built.recordInputs).toBe(false);
 		expect(built.recordOutputs).toBe(false);
+		expect(built.runtimeRootSpanEnabled).toBe(false);
 	});

 	it('accepts a pre-built tracer', async () => {
--- a/packages/@n8n/agents/src/runtime/agent-runtime.ts
+++ b/packages/@n8n/agents/src/runtime/agent-runtime.ts
@ -745,7 +745,7 @@ export class AgentRuntime {
 		fn: () => Promise<T>,
 	): Promise<T> {
 		const t = this.resolveTelemetry();
-		if (!t?.enabled || !isActiveSpanTracer(t.tracer)) {
+		if (!t?.enabled || t.runtimeRootSpanEnabled === false || !isActiveSpanTracer(t.tracer)) {
 			return await fn();
 		}

--- a/packages/@n8n/agents/src/sdk/telemetry.ts
+++ b/packages/@n8n/agents/src/sdk/telemetry.ts
@ -153,6 +153,8 @@ export class Telemetry {

 	protected recordOutputsValue = true;

+	protected runtimeRootSpanEnabledValue = true;
+
 	protected redactFn?: RedactFn;

 	protected integrationsList: TelemetryIntegration[] = [];
@ -223,6 +225,12 @@ export class Telemetry {
 		return this;
 	}

+	/** Enable or disable the generic AgentRuntime root span around generate/stream loops (enabled by default). */
+	runtimeRootSpan(value: boolean): this {
+		this.runtimeRootSpanEnabledValue = value;
+		return this;
+	}
+
 	/**
 	 * Set a redaction callback. When set, all integration hooks will
 	 * have their event data passed through this function before the
@ -287,6 +295,7 @@ export class Telemetry {
 			metadata: this.metadataValue,
 			recordInputs: this.recordInputsValue,
 			recordOutputs: this.recordOutputsValue,
+			runtimeRootSpanEnabled: this.runtimeRootSpanEnabledValue,
 			integrations,
 			tracer,
 			provider,
--- a/packages/@n8n/agents/src/types/telemetry.ts
+++ b/packages/@n8n/agents/src/types/telemetry.ts
@ -26,6 +26,8 @@ export interface BuiltTelemetry {
 	readonly metadata?: Record<string, AttributeValue>;
 	readonly recordInputs: boolean;
 	readonly recordOutputs: boolean;
+	/** Whether AgentRuntime adds its generic root span around generate/stream loops; only an explicit `false` disables it. */
+	readonly runtimeRootSpanEnabled?: boolean;
 	/** Integrations are pre-wrapped with redaction if .redact() was set at build time. */
 	readonly integrations: TelemetryIntegration[];
 	readonly tracer?: OpaqueTracer;
--- a/packages/@n8n/instance-ai/TRACING_SPECS.md
+++ b/packages/@n8n/instance-ai/TRACING_SPECS.md
@ -82,16 +82,22 @@ Implemented so far:
 - Normal foreground and detached trace creation no longer creates RunTree spans.
 - Agent tree snapshots persist OTel trace/span IDs alongside derived LangSmith
  IDs for feedback anchoring.
-
-Still wrong:
-
+- Instance AI disables the generic `@n8n/agents` runtime root span because the
+  product actor span already represents the agent loop; native provider and
+  `ai.toolCall` spans remain enabled and are parented directly under the
+  product actor span.
 - Live LangSmith validation has proved feedback against an OTel-only product
-  root; full provider-span validation with a real model turn is still pending.
- Some fallback RunTree compatibility code remains for legacy/replay-only
-  paths and should be deleted after rollout validation.
+  root and full provider-span visibility with a real model turn.
 - Detached sub-agent linking captures spawning trace/span metadata and model
  tool-call IDs when a detached task is spawned from a local tool handler.

+Remaining follow-up:
+
+- Some fallback RunTree compatibility code remains for legacy/manual stream
+  trace debugging only. It is off by default, is enabled only by setting
+  `N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING=true`, and should be removed in the
+  post-rollout cleanup once no legacy stream-hook consumers remain.
+
 ## Hybrid Reference Notes

 The last working hybrid traces showed RunTree product nodes such as
@ -107,6 +113,31 @@ longer shows the complete system/user/tool/provider turn under a single OTel
 context. Regression coverage now asserts normal Instance AI trace creation does
 not create RunTree spans.

+## Live Validation Notes
+
+Live validation with explicit credentials has covered two cases:
+
+- `instance-ai-tracing-validation` thread
+  `otel-validation-f97e5f00-589a-49fb-a536-d54d417c30eb` proved foreground
+  and detached roots are queryable by the same thread ID. The thread contained
+  `instance-ai.message_turn`, native `ai.generateText.doGenerate` spans with
+  tool definitions and token usage, a local tool span, and detached
+  `instance-ai.subagent.workflow-builder` metadata with spawning trace/span
+  IDs.
+- `instance-ai-tracing-validation` thread
+  `otel-runtime-root-validation-4c6d9b3c-ae3f-454a-bf97-d984be36a2be` proved
+  the generic `@n8n/agents` runtime root span can be suppressed for Instance
+  AI. The resulting foreground tree was
+  `instance-ai.message_turn -> instance-ai.orchestrator.stream ->
+  ai.generateText.doGenerate/add_numbers`, with no duplicate
+  `instance-ai.orchestrator.stream` wrapper.
+
+A fresh user-provided run `81b3a657-c452-484f-ac3c-122836016094` confirmed that
+the pre-suppression implementation had correct native provider/tool
+visibility, token usage, detached sub-agent roots, and spawning metadata, but
+still showed duplicate same-named agent wrapper spans. The runtime-root
+suppression is the follow-up fix for that trace-shape issue.
+
 ## Target Architecture

 ```mermaid
@ -330,6 +361,13 @@ The noisy AI SDK wrapper spans such as `ai.streamText` may be filtered from
 LangSmith export as long as provider request spans, tool spans, and product
 root spans remain correctly parented.

+For Instance AI, the generic `@n8n/agents` runtime root span around
+generate/stream loops is disabled. Generic agents may still use that span, but
+Instance AI already has explicit product actor spans such as
+`instance-ai.orchestrator.stream` and `instance-ai.subagent.<role>.stream`.
+Disabling the generic wrapper avoids duplicate same-named agent spans while
+preserving native provider and tool telemetry.
+
 ## Span Kinds and Inputs

 Use LangSmith-compatible span attributes:
@ -462,7 +500,7 @@ must not require LangSmith to be available.
 - LangSmith OTLP tracer/provider construction
 - LangSmith OTel span filtering
 - mapping telemetry into AI SDK `experimental_telemetry`
- runtime root spans around generate/stream loops
+- optional runtime root spans around generate/stream loops
 - AI-SDK-compatible local `ai.toolCall` spans
 - provider flush and shutdown hooks
 - generic telemetry integration hooks
@ -476,6 +514,8 @@ must not require LangSmith to be available.
 - feedback snapshot persistence
 - service proxy request metadata and headers
 - detached sub-agent linking metadata
+- disabling generic runtime root spans when product actor spans are already
+  present
 - trace replay events

 `@n8n/ai-utilities` may own:
@ -526,7 +566,11 @@ must not require LangSmith to be available.
     outside the foreground context.
   - [x] Add spawning metadata: trace ID, span ID, tool call ID, task ID, and
     agent role.
-   - [ ] Confirm thread queries show detached roots alongside foreground turns.
+   - [x] Confirm thread queries show detached roots alongside foreground turns.
+     Live validation in `instance-ai-tracing-validation` for thread
+     `otel-validation-f97e5f00-589a-49fb-a536-d54d417c30eb` returned 9 runs
+     across 2 traces, including `instance-ai.message_turn` and
+     `instance-ai.subagent.workflow-builder`.

 6. Rework feedback anchoring

@ -539,15 +583,21 @@ must not require LangSmith to be available.

   - [x] Remove normal-path `RunTree` root creation.
   - [x] Remove normal-path manual RunTree tool wrappers.
-   - [ ] Keep only temporary compatibility code behind an explicit flag, if
+   - [x] Keep only temporary compatibility code behind an explicit flag, if
     needed for rollout.
-   - [ ] Delete compatibility code after validation.
+     The flag is `N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING=true`; it is disabled
+     by default.
+   - [x] ~~Delete compatibility code after validation.~~ Deferred to
+     post-rollout cleanup after legacy manual stream-hook consumers are proven
+     unused.

 8. Decouple replay from tracing

   - [x] Ensure replay records stable Instance AI events, not span IDs.
   - [x] Ensure replay tests pass with LangSmith disabled.
-   - [ ] Optionally emit replay-tagged OTel spans for debugging only.
+   - [x] ~~Optionally emit replay-tagged OTel spans for debugging only.~~ Not
+     implemented; replay remains LangSmith-independent and does not emit debug
+     traces by default.

 9. Add regression coverage

@ -555,9 +605,12 @@ must not require LangSmith to be available.
   - [x] Unit test OTel product span parentage.
   - [x] Unit test feedback ID persistence.
   - [x] Unit test redaction preserving token usage.
-   - [ ] Local exporter test proving one foreground message turn contains
+   - [x] Local exporter test proving one foreground message turn contains
     product spans, native provider spans, and local tool spans.
-   - [ ] Live LangSmith validation behind explicit credentials.
+   - [x] Live LangSmith validation behind explicit credentials.
+     The validation showed native `ai.generateText.doGenerate` spans under the
+     foreground product trace with system/user messages, tool definitions,
+     tool choice, token usage, and a local tool span.

 ## Acceptance Criteria

--- a/packages/@n8n/instance-ai/src/runtime/tests/resumable-stream-executor.test.ts
+++ b/packages/@n8n/instance-ai/src/runtime/tests/resumable-stream-executor.test.ts
@ -317,8 +317,19 @@ interface PublishedEvent {
 }

 describe('executeResumableStream', () => {
+	const originalLegacyRunTreeTracing = process.env.N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING;
+
 	beforeEach(() => {
 		langsmithMock.reset();
+		process.env.N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING = 'true';
+	});
+
+	afterAll(() => {
+		if (originalLegacyRunTreeTracing === undefined) {
+			delete process.env.N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING;
+		} else {
+			process.env.N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING = originalLegacyRunTreeTracing;
+		}
 	});

 	it('buffers the confirmation event in manual mode', async () => {
--- a/packages/@n8n/instance-ai/src/tracing/tests/langsmith-tracing.test.ts
+++ b/packages/@n8n/instance-ai/src/tracing/tests/langsmith-tracing.test.ts
@ -1,14 +1,23 @@
-import type { Context } from '@opentelemetry/api';
+import type { Context, ContextManager } from '@opentelemetry/api';
 import { jsonParse } from 'n8n-workflow';
+import type * as AsyncHooks from 'node:async_hooks';

 import { executeTool } from '../../__tests__/tool-test-utils';

 jest.mock('@n8n/agents', () => {
 	const actual = jest.requireActual<Record<string, unknown>>('@n8n/agents');
-	const { context, trace } = jest.requireActual<{
+	const { AsyncLocalStorage } = jest.requireActual<typeof AsyncHooks>('node:async_hooks');
+	const { ROOT_CONTEXT, context, trace } = jest.requireActual<{
+		ROOT_CONTEXT: Context;
 		context: {
 			active(): Context;
-			with<T>(ctx: Context, fn: () => T): T;
+			with<T>(
+				ctx: Context,
+				fn: (...args: unknown[]) => T,
+				thisArg?: unknown,
+				...args: unknown[]
+			): T;
+			setGlobalContextManager(contextManager: ContextManager): boolean;
 		};
 		trace: {
 			getSpan(ctx: Context): unknown;
@ -17,6 +26,16 @@ jest.mock('@n8n/agents', () => {
 	}>('@opentelemetry/api');

 	let spanCounter = 0;
+	const contextStorage = new AsyncLocalStorage<Context>();
+	const contextManager: ContextManager = {
+		active: () => contextStorage.getStore() ?? ROOT_CONTEXT,
+		with: (ctx, fn, thisArg, ...args) => contextStorage.run(ctx, () => fn.call(thisArg, ...args)),
+		bind: (_ctx, target) => target,
+		enable: () => contextManager,
+		disable: () => contextManager,
+	};
+	context.setGlobalContextManager(contextManager);
+
 	const spans: Array<{
 		id: string;
 		traceId: string;
@ -100,6 +119,7 @@ jest.mock('@n8n/agents', () => {
 		private metadataValue?: Record<string, unknown>;
 		private recordInputsValue = true;
 		private recordOutputsValue = true;
+		private runtimeRootSpanEnabledValue = true;

 		functionId(value: string): this {
 			this.functionIdValue = value;
@ -121,6 +141,11 @@ jest.mock('@n8n/agents', () => {
 			return this;
 		}

+		runtimeRootSpan(value: boolean): this {
+			this.runtimeRootSpanEnabledValue = value;
+			return this;
+		}
+
 		async build(): Promise<Record<string, unknown>> {
 			return await Promise.resolve({
 				enabled: true,
@ -128,6 +153,7 @@ jest.mock('@n8n/agents', () => {
 				metadata: this.metadataValue,
 				recordInputs: this.recordInputsValue,
 				recordOutputs: this.recordOutputsValue,
+				runtimeRootSpanEnabled: this.runtimeRootSpanEnabledValue,
 				integrations: [],
 				tracer,
 				provider,
@ -512,6 +538,7 @@ describe('createInstanceAiTraceContext', () => {
 		expect(telemetry.functionId).toBe('instance-ai.orchestrator');
 		expect(telemetry.recordInputs).toBe(true);
 		expect(telemetry.recordOutputs).toBe(true);
+		expect(telemetry.runtimeRootSpanEnabled).toBe(false);
 		expect(telemetry.metadata).toEqual(
 			expect.objectContaining({
 				thread_id: 'thread-1',
@ -726,6 +753,16 @@ describe('createInstanceAiTraceContext', () => {
 				spawned_by_tool_call_id: 'toolu-1',
 			}),
 		);
+
+		const telemetryOrBuilder = tracing!.getTelemetry!({
+			agentRole: 'workflow-builder',
+			functionId: 'instance-ai.subagent.workflow-builder',
+			executionMode: 'detached_subagent',
+		});
+		const telemetry =
+			'build' in telemetryOrBuilder ? await telemetryOrBuilder.build() : telemetryOrBuilder;
+
+		expect(telemetry.runtimeRootSpanEnabled).toBe(false);
 	});

 	it('attaches root agent config without duplicating it into llm steps', async () => {
@ -1145,6 +1182,91 @@ describe('createInstanceAiTraceContext', () => {
 		expect(langsmithMock.getCreatedRunTrees()).toHaveLength(0);
 	});

+	it('keeps product, native provider, and local tool spans in one foreground OTel trace', async () => {
+		const tracing = await createInstanceAiTraceContext({
+			threadId: 'thread-local-exporter',
+			messageId: 'message-local-exporter',
+			runId: 'run-local-exporter',
+			userId: 'user-local-exporter',
+			input: { message: 'Build a workflow' },
+		});
+
+		expect(tracing).toBeDefined();
+
+		const wrappedTools = tracing!.wrapTools(
+			{
+				workspace_write_file: {
+					name: 'workspace_write_file',
+					description: 'Write a file in the workspace.',
+					handler: jest.fn(async () => await Promise.resolve({ written: true })),
+				} as never,
+			},
+			{ agentRole: 'workflow-builder' },
+		);
+		const workspaceWriteFile = wrappedTools.workspace_write_file;
+		if (!isExecutableTool(workspaceWriteFile)) {
+			throw new Error('Wrapped workspace_write_file tool is not executable');
+		}
+
+		await tracing!.withRunTree(tracing!.orchestratorRun, async () => {
+			const telemetryOrBuilder = tracing!.getTelemetry!({
+				agentRole: 'orchestrator',
+				functionId: 'instance-ai.orchestrator',
+			});
+			if ('build' in telemetryOrBuilder) {
+				throw new Error('Expected foreground tracing to reuse built OTel telemetry');
+			}
+
+			type NativeSpan = {
+				end(): void;
+				spanContext(): { traceId: string; spanId: string };
+			};
+			type NativeTracer = {
+				startSpan(name: string, options?: { attributes?: Record<string, unknown> }): NativeSpan;
+			};
+
+			const providerSpan = (telemetryOrBuilder.tracer as NativeTracer).startSpan(
+				'ai.streamText.doStream',
+				{
+					attributes: {
+						'ai.operationId': 'ai.streamText.doStream',
+						'langsmith.span.kind': 'llm',
+					},
+				},
+			);
+			providerSpan.end();
+
+			await workspaceWriteFile.handler(
+				{ path: 'workflow.json', content: '{}' },
+				{ toolCallId: 'toolu-write-file' },
+			);
+		});
+
+		await tracing!.finishRun(tracing!.orchestratorRun, { outputs: { status: 'done' } });
+		await tracing!.finishRun(tracing!.rootRun, { outputs: { status: 'done' } });
+
+		const spans = agentsMock.getSpans();
+		const rootSpan = spans.find((span) => span.name === 'instance-ai.message_turn');
+		const orchestratorSpan = spans.find((span) => span.name === 'instance-ai.orchestrator.stream');
+		const providerSpan = spans.find((span) => span.name === 'ai.streamText.doStream');
+		const localToolSpan = spans.find((span) => span.name === 'instance-ai.tool.workspace_edit');
+
+		expect(rootSpan).toBeDefined();
+		expect(orchestratorSpan).toBeDefined();
+		expect(providerSpan).toBeDefined();
+		expect(localToolSpan).toBeDefined();
+		expect(
+			new Set(
+				[rootSpan, orchestratorSpan, providerSpan, localToolSpan].map((span) => span?.traceId),
+			),
+		).toEqual(new Set([rootSpan?.traceId]));
+		expect(orchestratorSpan?.parentSpanId).toBe(rootSpan?.id);
+		expect(providerSpan?.parentSpanId).toBe(orchestratorSpan?.id);
+		expect(localToolSpan?.parentSpanId).toBe(orchestratorSpan?.id);
+		expect(localToolSpan?.attributes.tool_call_id).toBe('toolu-write-file');
+		expect(langsmithMock.getCreatedRunTrees()).toHaveLength(0);
+	});
+
 	it('returns undefined when tracing is explicitly disabled even with proxy', async () => {
 		process.env.LANGCHAIN_TRACING_V2 = 'false';

--- a/packages/@n8n/instance-ai/src/tracing/langsmith-tracing.ts
+++ b/packages/@n8n/instance-ai/src/tracing/langsmith-tracing.ts
@ -102,6 +102,7 @@ const LANGSMITH_SPAN_KIND = 'langsmith.span.kind';
 const LANGSMITH_SPAN_TAGS = 'langsmith.span.tags';
 const GEN_AI_PROMPT = 'gen_ai.prompt';
 const GEN_AI_COMPLETION = 'gen_ai.completion';
+const LEGACY_RUNTREE_TRACING_ENV = 'N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING';

 interface ProductOtelTraceRuntime {
 	telemetry: BuiltTelemetry;
@ -485,6 +486,10 @@ function ensureLangSmithTracingEnv(): void {
 	process.env.LANGSMITH_TRACING ??= 'true';
 }

+function isLegacyRunTreeTracingEnabled(): boolean {
+	return process.env[LEGACY_RUNTREE_TRACING_ENV]?.toLowerCase() === 'true';
+}
+
 function normalizeErrorMessage(error: unknown): string {
 	return error instanceof Error ? error.message : String(error);
 }
@ -997,6 +1002,10 @@ export async function submitLangsmithUserFeedback(
 }

 export function getTraceParentRun(): RunTree | undefined {
+	if (!isLegacyRunTreeTracingEnabled()) {
+		return undefined;
+	}
+
 	const overrideRun = traceParentOverrideStorage.getStore()?.current;
 	if (overrideRun) {
 		return overrideRun;
@ -1010,6 +1019,10 @@ export function getTraceParentRun(): RunTree | undefined {
 }

 export function setTraceParentOverride(parentRun: RunTree | null | undefined): void {
+	if (!isLegacyRunTreeTracingEnabled()) {
+		return;
+	}
+
 	const store = traceParentOverrideStorage.getStore();
 	if (store) {
 		store.current = parentRun ?? null;
@ -1105,6 +1118,10 @@ export async function withTraceParentContext<T>(
 	parentRun: RunTree | undefined,
 	fn: () => Promise<T>,
 ): Promise<T> {
+	if (!isLegacyRunTreeTracingEnabled()) {
+		return await fn();
+	}
+
 	// Always create a new nested ALS context. Mutating an existing store.current
 	// is not safe when concurrent background tasks inherit the same parent context.
 	return await traceParentOverrideStorage.run({ current: parentRun ?? null }, fn);
@ -2044,6 +2061,7 @@ function createTelemetryFactory(options: {
 				metadata,
 				recordInputs: true,
 				recordOutputs: true,
+				runtimeRootSpanEnabled: false,
 			};
 		}

@ -2051,7 +2069,8 @@ function createTelemetryFactory(options: {
 			.functionId(functionId)
 			.metadata(metadata)
 			.recordInputs(true)
-			.recordOutputs(true);
+			.recordOutputs(true)
+			.runtimeRootSpan(false);
 	};
 }