fix(instance-ai): suppress duplicate runtime trace spans

This commit is contained in:
Oleg Ivaniv 2026-05-05 17:32:52 +02:00
parent e5e0cb97cd
commit 23fa642e26
No known key found for this signature in database
9 changed files with 270 additions and 18 deletions

View File

@ -2073,6 +2073,39 @@ describe('AgentRuntime — telemetry propagation', () => {
expect(span.end).toHaveBeenCalledTimes(1);
});
it('can suppress the generic runtime root span while keeping native telemetry enabled', async () => {
	generateText.mockResolvedValue(makeGenerateSuccess());

	// Spy tracer: it must be forwarded to the AI SDK call without the runtime
	// ever opening a span on it.
	const startActiveSpan = jest.fn();
	const tracer = { startActiveSpan };
	const telemetry: BuiltTelemetry = {
		...baseTelemetry,
		runtimeRootSpanEnabled: false,
		tracer,
	};

	const runtime = new AgentRuntime({
		name: 'telemetry-root-test',
		model: 'openai/gpt-4o-mini',
		instructions: 'test',
		eventBus: new AgentEventBus(),
		telemetry,
	});
	await runtime.generate('hello');

	// The generic runtime root span was not opened...
	expect(startActiveSpan).not.toHaveBeenCalled();

	// ...while the telemetry settings still reach the generateText call.
	// eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
	const generateOptions = generateText.mock.calls[0][0] as Record<string, unknown>;
	const expectedTelemetry = {
		isEnabled: true,
		functionId: 'test-agent',
		tracer,
	};
	expect(generateOptions.experimental_telemetry).toEqual(
		expect.objectContaining(expectedTelemetry),
	);
});
it('adds a LangSmith tool catalog to telemetry root spans', async () => {
generateText.mockResolvedValue(makeGenerateSuccess());
const span = {

View File

@ -8,6 +8,7 @@ describe('Telemetry builder', () => {
expect(built.enabled).toBe(true);
expect(built.recordInputs).toBe(true);
expect(built.recordOutputs).toBe(true);
expect(built.runtimeRootSpanEnabled).toBe(true);
expect(built.functionId).toBeUndefined();
expect(built.metadata).toBeUndefined();
expect(built.integrations).toEqual([]);
@ -22,6 +23,7 @@ describe('Telemetry builder', () => {
.metadata({ team: 'platform', version: 2 })
.recordInputs(false)
.recordOutputs(false)
.runtimeRootSpan(false)
.build();
expect(built.enabled).toBe(false);
@ -29,6 +31,7 @@ describe('Telemetry builder', () => {
expect(built.metadata).toEqual({ team: 'platform', version: 2 });
expect(built.recordInputs).toBe(false);
expect(built.recordOutputs).toBe(false);
expect(built.runtimeRootSpanEnabled).toBe(false);
});
it('accepts a pre-built tracer', async () => {

View File

@ -745,7 +745,7 @@ export class AgentRuntime {
fn: () => Promise<T>,
): Promise<T> {
const t = this.resolveTelemetry();
if (!t?.enabled || !isActiveSpanTracer(t.tracer)) {
if (!t?.enabled || t.runtimeRootSpanEnabled === false || !isActiveSpanTracer(t.tracer)) {
return await fn();
}

View File

@ -153,6 +153,8 @@ export class Telemetry {
protected recordOutputsValue = true;
protected runtimeRootSpanEnabledValue = true;
protected redactFn?: RedactFn;
protected integrationsList: TelemetryIntegration[] = [];
@ -223,6 +225,12 @@ export class Telemetry {
return this;
}
/**
 * Enable or disable the generic AgentRuntime root span around generate/stream loops.
 *
 * The flag starts out as `true` and is emitted by `build()` as
 * `runtimeRootSpanEnabled`. AgentRuntime skips its root span only when that
 * value is exactly `false`.
 *
 * @param value - `false` to suppress the runtime root span, `true` to keep it.
 * @returns This builder, so calls can be chained.
 */
runtimeRootSpan(value: boolean): this {
this.runtimeRootSpanEnabledValue = value;
return this;
}
/**
* Set a redaction callback. When set, all integration hooks will
* have their event data passed through this function before the
@ -287,6 +295,7 @@ export class Telemetry {
metadata: this.metadataValue,
recordInputs: this.recordInputsValue,
recordOutputs: this.recordOutputsValue,
runtimeRootSpanEnabled: this.runtimeRootSpanEnabledValue,
integrations,
tracer,
provider,

View File

@ -26,6 +26,8 @@ export interface BuiltTelemetry {
readonly metadata?: Record<string, AttributeValue>;
readonly recordInputs: boolean;
readonly recordOutputs: boolean;
/** Whether AgentRuntime should add a generic chain span around generate/stream loops. Treated as enabled unless explicitly `false`. */
readonly runtimeRootSpanEnabled?: boolean;
/** Integrations are pre-wrapped with redaction if .redact() was set at build time. */
readonly integrations: TelemetryIntegration[];
readonly tracer?: OpaqueTracer;

View File

@ -82,16 +82,22 @@ Implemented so far:
- Normal foreground and detached trace creation no longer creates RunTree spans.
- Agent tree snapshots persist OTel trace/span IDs alongside derived LangSmith
IDs for feedback anchoring.
Still wrong:
- Instance AI disables the generic `@n8n/agents` runtime root span because the
product actor span already represents the agent loop; native provider and
`ai.toolCall` spans remain enabled and are parented directly under the
product actor span.
- Live LangSmith validation has proved feedback against an OTel-only product
root; full provider-span validation with a real model turn is still pending.
- Some fallback RunTree compatibility code remains for legacy/replay-only
paths and should be deleted after rollout validation.
root and full provider-span visibility with a real model turn.
- Detached sub-agent linking captures spawning trace/span metadata and model
tool-call IDs when a detached task is spawned from a local tool handler.
Remaining follow-up:
- Some fallback RunTree compatibility code remains for legacy/manual stream
trace debugging only. It is disabled by default behind
`N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING=true` and should be removed in the
post-rollout cleanup once no legacy stream-hook consumers remain.
## Hybrid Reference Notes
The last working hybrid traces showed RunTree product nodes such as
@ -107,6 +113,31 @@ longer shows the complete system/user/tool/provider turn under a single OTel
context. Regression coverage now asserts normal Instance AI trace creation does
not create RunTree spans.
## Live Validation Notes
Live validation with explicit credentials has covered two cases:
- `instance-ai-tracing-validation` thread
`otel-validation-f97e5f00-589a-49fb-a536-d54d417c30eb` proved foreground
and detached roots are queryable by the same thread ID. The thread contained
`instance-ai.message_turn`, native `ai.generateText.doGenerate` spans with
tool definitions and token usage, a local tool span, and detached
`instance-ai.subagent.workflow-builder` metadata with spawning trace/span
IDs.
- `instance-ai-tracing-validation` thread
`otel-runtime-root-validation-4c6d9b3c-ae3f-454a-bf97-d984be36a2be` proved
the generic `@n8n/agents` runtime root span can be suppressed for Instance
AI. The resulting foreground tree was
`instance-ai.message_turn -> instance-ai.orchestrator.stream ->
ai.generateText.doGenerate/add_numbers`, with no duplicate
`instance-ai.orchestrator.stream` wrapper.
A user-provided fresh run, `81b3a657-c452-484f-ac3c-122836016094`, confirmed
that the pre-suppression implementation already had correct native
provider/tool visibility, token usage, detached sub-agent roots, and spawning
metadata, but still showed duplicate same-named agent wrapper spans.
Runtime-root suppression is the follow-up fix for that duplicated-wrapper
trace shape.
## Target Architecture
```mermaid
@ -330,6 +361,13 @@ The noisy AI SDK wrapper spans such as `ai.streamText` may be filtered from
LangSmith export as long as provider request spans, tool spans, and product
root spans remain correctly parented.
For Instance AI, the generic `@n8n/agents` runtime root span around
generate/stream loops is disabled by building telemetry with
`runtimeRootSpanEnabled: false` (`.runtimeRootSpan(false)` on the telemetry
builder). Generic agents may still use that span, but Instance AI already has
explicit product actor spans such as `instance-ai.orchestrator.stream` and
`instance-ai.subagent.<role>.stream`. Disabling the generic wrapper avoids
duplicate same-named agent spans while preserving native provider and tool
telemetry.
## Span Kinds and Inputs
Use LangSmith-compatible span attributes:
@ -462,7 +500,7 @@ must not require LangSmith to be available.
- LangSmith OTLP tracer/provider construction
- LangSmith OTel span filtering
- mapping telemetry into AI SDK `experimental_telemetry`
- runtime root spans around generate/stream loops
- optional runtime root spans around generate/stream loops
- AI-SDK-compatible local `ai.toolCall` spans
- provider flush and shutdown hooks
- generic telemetry integration hooks
@ -476,6 +514,8 @@ must not require LangSmith to be available.
- feedback snapshot persistence
- service proxy request metadata and headers
- detached sub-agent linking metadata
- disabling generic runtime root spans when product actor spans are already
present
- trace replay events
`@n8n/ai-utilities` may own:
@ -526,7 +566,11 @@ must not require LangSmith to be available.
outside the foreground context.
- [x] Add spawning metadata: trace ID, span ID, tool call ID, task ID, and
agent role.
- [ ] Confirm thread queries show detached roots alongside foreground turns.
- [x] Confirm thread queries show detached roots alongside foreground turns.
Live validation in `instance-ai-tracing-validation` for thread
`otel-validation-f97e5f00-589a-49fb-a536-d54d417c30eb` returned 9 runs
across 2 traces, including `instance-ai.message_turn` and
`instance-ai.subagent.workflow-builder`.
6. Rework feedback anchoring
@ -539,15 +583,21 @@ must not require LangSmith to be available.
- [x] Remove normal-path `RunTree` root creation.
- [x] Remove normal-path manual RunTree tool wrappers.
- [ ] Keep only temporary compatibility code behind an explicit flag, if
- [x] Keep only temporary compatibility code behind an explicit flag, if
needed for rollout.
- [ ] Delete compatibility code after validation.
The flag is `N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING=true`; it is disabled
by default.
- [x] ~~Delete compatibility code after validation.~~ Deferred to
post-rollout cleanup after legacy manual stream-hook consumers are proven
unused.
8. Decouple replay from tracing
- [x] Ensure replay records stable Instance AI events, not span IDs.
- [x] Ensure replay tests pass with LangSmith disabled.
- [ ] Optionally emit replay-tagged OTel spans for debugging only.
- [x] ~~Optionally emit replay-tagged OTel spans for debugging only.~~ Not
implemented; replay remains LangSmith-independent and does not emit debug
traces by default.
9. Add regression coverage
@ -555,9 +605,12 @@ must not require LangSmith to be available.
- [x] Unit test OTel product span parentage.
- [x] Unit test feedback ID persistence.
- [x] Unit test redaction preserving token usage.
- [ ] Local exporter test proving one foreground message turn contains
- [x] Local exporter test proving one foreground message turn contains
product spans, native provider spans, and local tool spans.
- [ ] Live LangSmith validation behind explicit credentials.
- [x] Live LangSmith validation behind explicit credentials.
The validation showed native `ai.generateText.doGenerate` spans under the
foreground product trace with system/user messages, tool definitions,
tool choice, token usage, and a local tool span.
## Acceptance Criteria

View File

@ -317,8 +317,19 @@ interface PublishedEvent {
}
describe('executeResumableStream', () => {
// Snapshot of the legacy-tracing flag, taken once so the suite can put the
// environment back the way it found it.
const originalLegacyRunTreeTracing = process.env.N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING;

beforeEach(() => {
	langsmithMock.reset();
	// Every test in this suite runs with the legacy RunTree tracing flag set.
	process.env.N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING = 'true';
});

afterAll(() => {
	if (originalLegacyRunTreeTracing !== undefined) {
		process.env.N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING = originalLegacyRunTreeTracing;
		return;
	}
	// The variable was unset before the suite ran, so remove it again.
	delete process.env.N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING;
});
it('buffers the confirmation event in manual mode', async () => {

View File

@ -1,14 +1,23 @@
import type { Context } from '@opentelemetry/api';
import type { Context, ContextManager } from '@opentelemetry/api';
import { jsonParse } from 'n8n-workflow';
import type * as AsyncHooks from 'node:async_hooks';
import { executeTool } from '../../__tests__/tool-test-utils';
jest.mock('@n8n/agents', () => {
const actual = jest.requireActual<Record<string, unknown>>('@n8n/agents');
const { context, trace } = jest.requireActual<{
const { AsyncLocalStorage } = jest.requireActual<typeof AsyncHooks>('node:async_hooks');
const { ROOT_CONTEXT, context, trace } = jest.requireActual<{
ROOT_CONTEXT: Context;
context: {
active(): Context;
with<T>(ctx: Context, fn: () => T): T;
with<T>(
ctx: Context,
fn: (...args: unknown[]) => T,
thisArg?: unknown,
...args: unknown[]
): T;
setGlobalContextManager(contextManager: ContextManager): boolean;
};
trace: {
getSpan(ctx: Context): unknown;
@ -17,6 +26,16 @@ jest.mock('@n8n/agents', () => {
}>('@opentelemetry/api');
let spanCounter = 0;
// Context manager for the mocked module, backed by AsyncLocalStorage.
// NOTE(review): presumably needed because @opentelemetry/api does not track an
// active context until a context manager is registered — confirm.
const contextStorage = new AsyncLocalStorage<Context>();
const contextManager: ContextManager = {
// Falls back to ROOT_CONTEXT when nothing is stored for the current async scope.
active: () => contextStorage.getStore() ?? ROOT_CONTEXT,
// Runs fn with ctx as the stored context, forwarding thisArg and any extra args.
with: (ctx, fn, thisArg, ...args) => contextStorage.run(ctx, () => fn.call(thisArg, ...args)),
// Binding is a no-op here: the target is returned unchanged.
bind: (_ctx, target) => target,
// enable/disable do nothing beyond returning the manager itself.
enable: () => contextManager,
disable: () => contextManager,
};
// Registered globally when the mock factory runs.
context.setGlobalContextManager(contextManager);
const spans: Array<{
id: string;
traceId: string;
@ -100,6 +119,7 @@ jest.mock('@n8n/agents', () => {
private metadataValue?: Record<string, unknown>;
private recordInputsValue = true;
private recordOutputsValue = true;
private runtimeRootSpanEnabledValue = true;
functionId(value: string): this {
this.functionIdValue = value;
@ -121,6 +141,11 @@ jest.mock('@n8n/agents', () => {
return this;
}
/**
 * Test double for the builder's `runtimeRootSpan`: stores the flag so that
 * `build()` can report it back as `runtimeRootSpanEnabled`.
 */
runtimeRootSpan(value: boolean): this {
this.runtimeRootSpanEnabledValue = value;
return this;
}
async build(): Promise<Record<string, unknown>> {
return await Promise.resolve({
enabled: true,
@ -128,6 +153,7 @@ jest.mock('@n8n/agents', () => {
metadata: this.metadataValue,
recordInputs: this.recordInputsValue,
recordOutputs: this.recordOutputsValue,
runtimeRootSpanEnabled: this.runtimeRootSpanEnabledValue,
integrations: [],
tracer,
provider,
@ -512,6 +538,7 @@ describe('createInstanceAiTraceContext', () => {
expect(telemetry.functionId).toBe('instance-ai.orchestrator');
expect(telemetry.recordInputs).toBe(true);
expect(telemetry.recordOutputs).toBe(true);
expect(telemetry.runtimeRootSpanEnabled).toBe(false);
expect(telemetry.metadata).toEqual(
expect.objectContaining({
thread_id: 'thread-1',
@ -726,6 +753,16 @@ describe('createInstanceAiTraceContext', () => {
spawned_by_tool_call_id: 'toolu-1',
}),
);
const telemetryOrBuilder = tracing!.getTelemetry!({
agentRole: 'workflow-builder',
functionId: 'instance-ai.subagent.workflow-builder',
executionMode: 'detached_subagent',
});
const telemetry =
'build' in telemetryOrBuilder ? await telemetryOrBuilder.build() : telemetryOrBuilder;
expect(telemetry.runtimeRootSpanEnabled).toBe(false);
});
it('attaches root agent config without duplicating it into llm steps', async () => {
@ -1145,6 +1182,91 @@ describe('createInstanceAiTraceContext', () => {
expect(langsmithMock.getCreatedRunTrees()).toHaveLength(0);
});
// Regression test: during one foreground turn the product spans, a
// provider-style span, and a wrapped local tool span must all land in the same
// trace, with the provider and tool spans parented to the orchestrator span,
// and no RunTree must be created.
it('keeps product, native provider, and local tool spans in one foreground OTel trace', async () => {
const tracing = await createInstanceAiTraceContext({
threadId: 'thread-local-exporter',
messageId: 'message-local-exporter',
runId: 'run-local-exporter',
userId: 'user-local-exporter',
input: { message: 'Build a workflow' },
});
expect(tracing).toBeDefined();
// Wrap a stub tool so that calling its handler goes through the tracing wrapper.
const wrappedTools = tracing!.wrapTools(
{
workspace_write_file: {
name: 'workspace_write_file',
description: 'Write a file in the workspace.',
handler: jest.fn(async () => await Promise.resolve({ written: true })),
} as never,
},
{ agentRole: 'workflow-builder' },
);
const workspaceWriteFile = wrappedTools.workspace_write_file;
if (!isExecutableTool(workspaceWriteFile)) {
throw new Error('Wrapped workspace_write_file tool is not executable');
}
// Everything inside this callback runs under the orchestrator run; the
// assertions below expect spans started here to be children of its span.
await tracing!.withRunTree(tracing!.orchestratorRun, async () => {
const telemetryOrBuilder = tracing!.getTelemetry!({
agentRole: 'orchestrator',
functionId: 'instance-ai.orchestrator',
});
// Foreground tracing is expected to hand back already-built telemetry, not a builder.
if ('build' in telemetryOrBuilder) {
throw new Error('Expected foreground tracing to reuse built OTel telemetry');
}
// Minimal structural types for the parts of the tracer this test touches.
type NativeSpan = {
end(): void;
spanContext(): { traceId: string; spanId: string };
};
type NativeTracer = {
startSpan(name: string, options?: { attributes?: Record<string, unknown> }): NativeSpan;
};
// Manually start and end a span named like a native provider span.
const providerSpan = (telemetryOrBuilder.tracer as NativeTracer).startSpan(
'ai.streamText.doStream',
{
attributes: {
'ai.operationId': 'ai.streamText.doStream',
'langsmith.span.kind': 'llm',
},
},
);
providerSpan.end();
// Invoke the wrapped tool with a tool call ID that is asserted on the span below.
await workspaceWriteFile.handler(
{ path: 'workflow.json', content: '{}' },
{ toolCallId: 'toolu-write-file' },
);
});
// Finish inner run first, then the root run.
await tracing!.finishRun(tracing!.orchestratorRun, { outputs: { status: 'done' } });
await tracing!.finishRun(tracing!.rootRun, { outputs: { status: 'done' } });
const spans = agentsMock.getSpans();
const rootSpan = spans.find((span) => span.name === 'instance-ai.message_turn');
const orchestratorSpan = spans.find((span) => span.name === 'instance-ai.orchestrator.stream');
const providerSpan = spans.find((span) => span.name === 'ai.streamText.doStream');
// NOTE(review): the tool is registered as `workspace_write_file` but its span is
// looked up as `workspace_edit` — presumably the wrapper maps tool names to a
// span name; confirm against the wrapper implementation.
const localToolSpan = spans.find((span) => span.name === 'instance-ai.tool.workspace_edit');
expect(rootSpan).toBeDefined();
expect(orchestratorSpan).toBeDefined();
expect(providerSpan).toBeDefined();
expect(localToolSpan).toBeDefined();
// All four spans must share the root span's trace ID.
expect(
new Set(
[rootSpan, orchestratorSpan, providerSpan, localToolSpan].map((span) => span?.traceId),
),
).toEqual(new Set([rootSpan?.traceId]));
// Parentage: root -> orchestrator -> { provider span, local tool span }.
expect(orchestratorSpan?.parentSpanId).toBe(rootSpan?.id);
expect(providerSpan?.parentSpanId).toBe(orchestratorSpan?.id);
expect(localToolSpan?.parentSpanId).toBe(orchestratorSpan?.id);
expect(localToolSpan?.attributes.tool_call_id).toBe('toolu-write-file');
// No RunTree objects may be created on this path.
expect(langsmithMock.getCreatedRunTrees()).toHaveLength(0);
});
it('returns undefined when tracing is explicitly disabled even with proxy', async () => {
process.env.LANGCHAIN_TRACING_V2 = 'false';

View File

@ -102,6 +102,7 @@ const LANGSMITH_SPAN_KIND = 'langsmith.span.kind';
const LANGSMITH_SPAN_TAGS = 'langsmith.span.tags';
const GEN_AI_PROMPT = 'gen_ai.prompt';
const GEN_AI_COMPLETION = 'gen_ai.completion';
const LEGACY_RUNTREE_TRACING_ENV = 'N8N_INSTANCE_AI_LEGACY_RUNTREE_TRACING';
interface ProductOtelTraceRuntime {
telemetry: BuiltTelemetry;
@ -485,6 +486,10 @@ function ensureLangSmithTracingEnv(): void {
process.env.LANGSMITH_TRACING ??= 'true';
}
/**
 * Reports whether the legacy RunTree tracing path is switched on.
 *
 * The flag is read from the environment on every call and only counts as
 * enabled when its value equals `true`, compared case-insensitively. An unset
 * variable or any other value means disabled.
 */
function isLegacyRunTreeTracingEnabled(): boolean {
	const flag = process.env[LEGACY_RUNTREE_TRACING_ENV];
	if (flag === undefined) {
		return false;
	}
	return flag.toLowerCase() === 'true';
}
/**
 * Turns an arbitrary thrown value into a message string.
 *
 * @param error - Any caught value.
 * @returns The `message` of an `Error` instance, otherwise `String(error)`.
 */
function normalizeErrorMessage(error: unknown): string {
	if (error instanceof Error) {
		return error.message;
	}
	return String(error);
}
@ -997,6 +1002,10 @@ export async function submitLangsmithUserFeedback(
}
export function getTraceParentRun(): RunTree | undefined {
if (!isLegacyRunTreeTracingEnabled()) {
return undefined;
}
const overrideRun = traceParentOverrideStorage.getStore()?.current;
if (overrideRun) {
return overrideRun;
@ -1010,6 +1019,10 @@ export function getTraceParentRun(): RunTree | undefined {
}
export function setTraceParentOverride(parentRun: RunTree | null | undefined): void {
if (!isLegacyRunTreeTracingEnabled()) {
return;
}
const store = traceParentOverrideStorage.getStore();
if (store) {
store.current = parentRun ?? null;
@ -1105,6 +1118,10 @@ export async function withTraceParentContext<T>(
parentRun: RunTree | undefined,
fn: () => Promise<T>,
): Promise<T> {
if (!isLegacyRunTreeTracingEnabled()) {
return await fn();
}
// Always create a new nested ALS context. Mutating an existing store.current
// is not safe when concurrent background tasks inherit the same parent context.
return await traceParentOverrideStorage.run({ current: parentRun ?? null }, fn);
@ -2044,6 +2061,7 @@ function createTelemetryFactory(options: {
metadata,
recordInputs: true,
recordOutputs: true,
runtimeRootSpanEnabled: false,
};
}
@ -2051,7 +2069,8 @@ function createTelemetryFactory(options: {
.functionId(functionId)
.metadata(metadata)
.recordInputs(true)
.recordOutputs(true);
.recordOutputs(true)
.runtimeRootSpan(false);
};
}