n8n/packages/@n8n/instance-ai/evaluations/langsmith/dataset-sync.ts
José Braulio González Valido 30d9a168bc
feat(ai-builder): Add --prebuilt-workflows flag for eval CLI (no-changelog) (#29830)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-07 11:47:29 +00:00

288 lines
9.2 KiB
TypeScript

// ---------------------------------------------------------------------------
// LangSmith dataset sync
//
// Syncs JSON test case files from the repo to a LangSmith dataset. Existing
// examples are found by inputs (testCaseFile + scenarioName) and updated in
// place; new scenarios get a random UUID. Stale examples are left in place
// — LangSmith's soft-delete tombstones UUIDs, which historically caused 409
// conflicts on resurrection; manual orphan cleanup happens via UI or MCP.
// ---------------------------------------------------------------------------
import { randomUUID } from 'crypto';
import type { Client } from 'langsmith';
import type { Example, KVMap } from 'langsmith/schemas';
import { z } from 'zod';
import { loadWorkflowTestCasesWithFiles } from '../data/workflows';
import type { EvalLogger } from '../harness/logger';
// Field validators for one scenario's inputs; every field is a required string.
const datasetExampleInputFields = {
	prompt: z.string(),
	testCaseFile: z.string(),
	scenarioName: z.string(),
	scenarioDescription: z.string(),
	dataSetup: z.string(),
	successCriteria: z.string(),
};

/**
 * Inputs handed to the target function for a single scenario.
 *
 * `testCaseFile` is carried in the inputs (not only in metadata) so that the
 * LangSmith Inputs column shows which workflow the scenario belongs to;
 * metadata is hidden by default.
 */
export const datasetExampleInputsSchema = z.object(datasetExampleInputFields);

/** Static type inferred from {@link datasetExampleInputsSchema}. */
export type DatasetExampleInputs = z.infer<typeof datasetExampleInputsSchema>;
// Allowed values for the optional `complexity` metadata key.
const datasetExampleComplexity = z.enum(['simple', 'medium', 'complex']);
// Allowed values for the optional `triggerType` metadata key.
const datasetExampleTriggerType = z.enum(['manual', 'webhook', 'schedule', 'form']);

/**
 * Per-example metadata, used for filtering and grouping in the LangSmith UI.
 * Only `testCaseFile` is required; the remaining keys are optional.
 */
export const datasetExampleMetadataSchema = z.object({
	/** Mirrors the `testCaseFile` input: the LangSmith UI can only group by metadata keys. */
	testCaseFile: z.string(),
	complexity: datasetExampleComplexity.optional(),
	tags: z.string().array().optional(),
	triggerType: datasetExampleTriggerType.optional(),
});

/** Static type inferred from {@link datasetExampleMetadataSchema}. */
export type DatasetExampleMetadata = z.infer<typeof datasetExampleMetadataSchema>;
/**
 * Push the repo's JSON test cases into a LangSmith dataset.
 *
 * Steps, in order:
 * 1. Load the test cases (honouring `filter` / `exclude`) and flatten them
 *    round-robin so consecutive examples come from different test cases.
 * 2. Look the dataset up by name; create it when it does not exist.
 * 3. Index the dataset's current examples by `testCaseFile/scenarioName`,
 *    read from each example's inputs.
 * 4. For every scenario: create it under a fresh random UUID when no example
 *    has that key, or update the matching example when its inputs or
 *    metadata differ. Unchanged examples are left alone.
 * 5. Each written example is assigned a split equal to its test case file slug.
 *
 * Nothing is ever deleted here; orphaned examples must be cleaned up manually
 * (LangSmith UI or MCP).
 *
 * @param lsClient - LangSmith client used for every dataset/example call.
 * @param datasetName - Name of the dataset to sync into.
 * @param logger - Receives progress messages.
 * @param filter - Optional filter forwarded to the test case loader.
 * @param exclude - Optional exclusion forwarded to the test case loader.
 * @returns `datasetName`, unchanged, for use with evaluate().
 */
export async function syncDataset(
	lsClient: Client,
	datasetName: string,
	logger: EvalLogger,
	filter?: string,
	exclude?: string,
): Promise<string> {
	type ExamplePayload = {
		id: string;
		inputs: KVMap;
		metadata: KVMap;
		split: string;
		dataset_id: string;
	};

	const loadedTestCases = loadWorkflowTestCasesWithFiles(filter, exclude);

	// Interleaving test cases means evaluate() kicks off a variety of builds
	// early instead of spending every concurrency slot on a single test case.
	const orderedScenarios = buildRoundRobinScenarios(loadedTestCases);

	logger.info(
		`Dataset sync: ${String(orderedScenarios.length)} scenarios from ${String(loadedTestCases.length)} test cases`,
	);

	// `hasDataset` separates "not found" from auth/network failures, so the
	// dataset is only created when it is genuinely missing.
	const datasetAlreadyExists = await lsClient.hasDataset({ datasetName });
	const dataset = datasetAlreadyExists
		? await lsClient.readDataset({ datasetName })
		: await lsClient.createDataset(datasetName, {
				description: 'Instance AI workflow execution evaluations (synced from repo JSON files)',
			});
	const datasetId = dataset.id;
	if (!datasetAlreadyExists) {
		logger.info(`Created dataset: ${datasetName}`);
	}

	// Index what is already in the dataset by the key derived from its inputs.
	// Examples whose inputs do not parse are skipped.
	const remoteByKey = new Map<string, Example>();
	for await (const remote of lsClient.listExamples({ datasetId })) {
		const parsedInputs = existingInputsSchema.safeParse(remote.inputs);
		if (parsedInputs.success) {
			const { testCaseFile, scenarioName } = parsedInputs.data;
			remoteByKey.set(`${testCaseFile}/${scenarioName}`, remote);
		}
	}

	// Work out which scenarios need creating and which need updating.
	const pendingCreates: ExamplePayload[] = [];
	const pendingUpdates: ExamplePayload[] = [];

	for (const scenario of orderedScenarios) {
		const inputs: DatasetExampleInputs = {
			prompt: scenario.prompt,
			testCaseFile: scenario.testCaseFile,
			scenarioName: scenario.scenarioName,
			scenarioDescription: scenario.scenarioDescription,
			dataSetup: scenario.dataSetup,
			successCriteria: scenario.successCriteria,
		};
		const metadata: DatasetExampleMetadata = {
			testCaseFile: scenario.testCaseFile,
			complexity: scenario.complexity,
			tags: scenario.tags,
			triggerType: scenario.triggerType,
		};
		const split = scenario.testCaseFile;

		const remote = remoteByKey.get(`${scenario.testCaseFile}/${scenario.scenarioName}`);
		if (!remote) {
			// No example with this key yet: create one under a new random UUID.
			pendingCreates.push({ id: randomUUID(), inputs, metadata, split, dataset_id: datasetId });
			continue;
		}

		const isUnchanged =
			!hasInputsChanged(remote.inputs, inputs) && !hasMetadataChanged(remote.metadata, metadata);
		if (isUnchanged) continue;

		// Reuse the existing example's ID so it is updated in place.
		pendingUpdates.push({ id: remote.id, inputs, metadata, split, dataset_id: datasetId });
	}

	if (pendingCreates.length > 0) {
		await lsClient.createExamples(pendingCreates);
		logger.info(` Created ${String(pendingCreates.length)} example(s)`);
	}

	if (pendingUpdates.length > 0) {
		await lsClient.updateExamples(pendingUpdates);
		logger.info(` Updated ${String(pendingUpdates.length)} example(s)`);
	}

	if (pendingCreates.length === 0 && pendingUpdates.length === 0) {
		logger.info(' Dataset up to date');
	}

	return datasetName;
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
type ScenarioComplexity = 'simple' | 'medium' | 'complex';
type ScenarioTriggerType = 'manual' | 'webhook' | 'schedule' | 'form';

/** One scenario joined with the fields of the test case it belongs to. */
interface FlatScenario {
	prompt: string;
	testCaseFile: string;
	scenarioName: string;
	scenarioDescription: string;
	dataSetup: string;
	successCriteria: string;
	complexity?: ScenarioComplexity;
	tags?: string[];
	triggerType?: ScenarioTriggerType;
}

/**
 * Flatten test cases into a single scenario list, interleaved across test
 * cases: first every test case's scenario 0, then every scenario 1, and so
 * on. Test cases that have run out of scenarios are skipped in later rounds.
 *
 * Input:  [tc1(s1,s2,s3), tc2(s1,s2), tc3(s1)]
 * Output: [tc1/s1, tc2/s1, tc3/s1, tc1/s2, tc2/s2, tc1/s3]
 *
 * @param testCasesWithFiles - Test cases paired with their file slug.
 * @returns The flattened scenarios; empty when there are no scenarios at all.
 */
function buildRoundRobinScenarios(
	testCasesWithFiles: Array<{
		testCase: {
			prompt: string;
			complexity?: ScenarioComplexity;
			tags?: string[];
			triggerType?: ScenarioTriggerType;
			scenarios: Array<{
				name: string;
				description: string;
				dataSetup: string;
				successCriteria: string;
			}>;
		};
		fileSlug: string;
	}>,
): FlatScenario[] {
	// Number of rounds = length of the longest scenario list (0 when empty).
	const roundCount = testCasesWithFiles.reduce(
		(longest, { testCase }) => Math.max(longest, testCase.scenarios.length),
		0,
	);
	const rounds = Array.from({ length: roundCount }, (_unused, round) => round);

	return rounds.flatMap((round) =>
		testCasesWithFiles.flatMap(({ testCase, fileSlug }): FlatScenario[] => {
			const picked = testCase.scenarios[round];
			// This test case has no scenario for the current round.
			if (!picked) return [];
			return [
				{
					prompt: testCase.prompt,
					testCaseFile: fileSlug,
					scenarioName: picked.name,
					scenarioDescription: picked.description,
					dataSetup: picked.dataSetup,
					successCriteria: picked.successCriteria,
					complexity: testCase.complexity,
					tags: testCase.tags,
					triggerType: testCase.triggerType,
				},
			];
		}),
	);
}
// The SDK types existing LangSmith example data as an open KVMap, so it is
// validated before diffing. Only the fields needed for the diff are declared;
// a missing field falls back to '' and any extra keys are passed through.
const existingInputText = z.string().default('');
const existingInputsSchema = z
	.object({
		prompt: existingInputText,
		testCaseFile: existingInputText,
		scenarioName: existingInputText,
		scenarioDescription: existingInputText,
		dataSetup: existingInputText,
		successCriteria: existingInputText,
	})
	.passthrough();
// Lenient view of an existing example's metadata, used only for diffing.
// Missing string keys default to '' and missing tags to []; `complexity` and
// `triggerType` accept any string here. Unknown keys are passed through.
const existingMetadataText = z.string().default('');
const existingMetadataSchema = z
	.object({
		testCaseFile: existingMetadataText,
		complexity: existingMetadataText,
		triggerType: existingMetadataText,
		tags: z.string().array().default([]),
	})
	.passthrough();
/**
 * Report whether an existing example's inputs differ from the incoming ones.
 *
 * Existing data that fails to parse counts as changed, so a malformed row is
 * overwritten with fresh values rather than aborting the whole sync.
 * `scenarioName` is not compared here.
 *
 * @param existing - Inputs as stored on the existing example (null/undefined is treated as `{}`).
 * @param incoming - Inputs built from the repo test case.
 * @returns `true` when any compared field differs or `existing` is unparseable.
 */
function hasInputsChanged(existing: unknown, incoming: DatasetExampleInputs): boolean {
	const result = existingInputsSchema.safeParse(existing ?? {});
	if (!result.success) return true;

	const stored = result.data;
	const comparedFields = [
		'prompt',
		'testCaseFile',
		'dataSetup',
		'successCriteria',
		'scenarioDescription',
	] as const;
	return comparedFields.some((field) => stored[field] !== incoming[field]);
}
/**
 * Report whether an existing example's metadata differs from the incoming metadata.
 *
 * Absent optional values on the incoming side are compared as '' (strings)
 * or [] (tags), matching the defaults applied when parsing `existing`.
 * Tags are compared via their JSON serialization, so order matters.
 *
 * @param existing - Metadata as stored on the existing example (null/undefined is treated as `{}`).
 * @param incoming - Metadata built from the repo test case.
 * @returns `true` when any compared key differs or `existing` is unparseable.
 */
function hasMetadataChanged(existing: unknown, incoming: DatasetExampleMetadata): boolean {
	const result = existingMetadataSchema.safeParse(existing ?? {});
	if (!result.success) return true;

	const stored = result.data;
	if (stored.testCaseFile !== incoming.testCaseFile) return true;
	if (stored.complexity !== (incoming.complexity ?? '')) return true;
	if (stored.triggerType !== (incoming.triggerType ?? '')) return true;

	const storedTags = JSON.stringify(stored.tags);
	const incomingTags = JSON.stringify(incoming.tags ?? []);
	return storedTags !== incomingTags;
}