mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-05-30 16:16:50 +02:00
feat(KB): conditional warnings A + B on Stored Files (RFC #883 §6)
Surfaces two silent failure modes that the prior binary "any-chunks-in-Qdrant ⇒ embedded" check could not distinguish from healthy ingestion: - **Warning A — Zero-chunk file** (file_size > 100 MB, chunks = 0) Fires on video-only / image-only ZIMs (`lrnselfreliance_en_all`, TED talks, etc.) that the pipeline completes "successfully" with no extractable text. AI Assistant literally cannot reference these. - **Warning B — Partial-embed stall** (chunks < 50% of expected from the ratio registry). Surfaces the simple_wiki "266 of 600,000 chunks" case observed during NOMAD1 ingestion testing — previously these looked identical to fully-completed embeds in the UI. Both warnings render only when their condition is met (silent by default; noisy only on real problems). Base is `feat/kb-ratio-registry` (#891) because Warning B's "expected chunks" estimate comes from `KbRatioRegistry.estimateChunks()`. GitHub fast-forwards to `rc` once #891 merges. - `app/utils/kb_warning_decision.ts` — pure `decideWarnings(inputs)` with thresholds (`100 MB`, `0.5×`) as exported constants. 10 unit tests cover the healthy case, both warnings, the under/at/over boundary, the registry-miss suppression, and the video-only registry case (`expectedChunks: 0` correctly skips Warning B). - `RagService.computeFileWarnings()` — single Qdrant scroll tallies chunks per source, filesystem walk fills in zero-chunk files, ratio registry estimates the expectation, decision function emits. - New endpoint `GET /api/rag/file-warnings` returns `Record<source, FileWarning[]>` (sources with no warnings are omitted, so the frontend can `warnings[source] ?? []` for clean defaults). - KB modal: warnings render inline under the file name as amber-tinted pills. Polled every 30s alongside the existing health check. - Warning C — chunks skipped due to length. PR #890 (#881 fix) prevents the silent drop at the embed boundary, so the underlying condition shouldn't fire anymore. 
If we still want to surface "we truncated N chunks to fit", that needs separate `skipped_count` tracking in EmbedFileJob — a Phase 2 follow-up. - Suppressing Warning B during active mid-ingestion. The user can cross-reference the Processing Queue to know it's in-flight; suppressing warnings while a job runs would mask real stalls where the job died mid-batch. Will revisit when per-card status is wired through. - Use of `kb_ingest_state.chunks_embedded` (#888) as the chunk count source. This PR uses Qdrant scroll directly so it can land independently of #888. - 10 new unit tests on `decideWarnings`, all pass - Type-check clean - Hot-patch + browser smoke test deferred until #891 lands (the ratio registry needs to exist in the DB for `estimateChunks()` to return non-null estimates — without it, only Warning A fires, which is still useful, but Warning B stays dormant)
This commit is contained in:
parent
ab8281d08b
commit
7c2282acf1
|
|
@ -68,6 +68,11 @@ export default class RagController {
|
|||
return response.status(200).json({ files })
|
||||
}
|
||||
|
||||
/**
 * Handler for the file-warnings endpoint: resolves the per-source warning map
 * from the RAG service and replies `200` with it wrapped as `{ warnings }`.
 */
public async getFileWarnings(ctx: HttpContext) {
  // Resolve the map before touching the response so a rejection leaves the
  // response object unmodified.
  const warnings = await this.ragService.computeFileWarnings()
  const payload = { warnings }
  return ctx.response.status(200).json(payload)
}
|
||||
|
||||
public async deleteFile({ request, response }: HttpContext) {
|
||||
const { source } = await request.validateUsing(deleteFileSchema)
|
||||
const result = await this.ragService.deleteFileBySource(source)
|
||||
|
|
|
|||
|
|
@ -18,6 +18,8 @@ import { join, resolve, sep } from 'node:path'
|
|||
import KVStore from '#models/kv_store'
|
||||
import KbIngestState from '#models/kb_ingest_state'
|
||||
import { decideScanAction, type IngestPolicy } from '../utils/kb_ingest_decision.js'
|
||||
import KbRatioRegistry from '#models/kb_ratio_registry'
|
||||
import { decideWarnings, type FileWarning } from '../utils/kb_warning_decision.js'
|
||||
import { ZIMExtractionService } from './zim_extraction_service.js'
|
||||
import { ZIM_BATCH_SIZE } from '../../constants/zim_extraction.js'
|
||||
import { ProcessAndEmbedFileResponse, ProcessZIMFileResponse, RAGResult, RerankedRAGResult } from '../../types/rag.js'
|
||||
|
|
@ -1086,6 +1088,90 @@ export class RagService {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Compute conditional warnings (RFC #883 §6) for every known source: each
 * `source` payload value found in Qdrant plus each file found under the
 * uploads and ZIM storage directories.
 *
 * Steps:
 *  1. Scroll the whole content collection once and tally points per `source`.
 *  2. Walk both storage directories so files that have no Qdrant points at
 *     all still appear (with a chunk count of 0), which is what lets the
 *     zero-chunk warning fire.
 *  3. Ask the ratio registry for an expected chunk count per file and pass
 *     size / actual / expected to `decideWarnings`.
 *
 * @returns Map from source path to its warnings. Sources without warnings are
 *   omitted, so callers can default with `warningsBySource[source] ?? []`.
 *   On any failure the error is logged and an empty map is returned instead
 *   of throwing.
 *
 * NOTE(review): an empty map on the error path is indistinguishable from
 * "everything is healthy" for the caller — confirm that is acceptable for the
 * UI that polls this.
 */
public async computeFileWarnings(): Promise<Record<string, FileWarning[]>> {
  try {
    await this._ensureCollection(
      RagService.CONTENT_COLLECTION_NAME,
      RagService.EMBEDDING_DIMENSION
    )

    // Per-source chunk count from a single scroll. We deliberately don't
    // assume `kb_ingest_state.chunks_embedded` here so this stays independent
    // of the state-machine PR (#888) — a future cleanup can read from there
    // for efficiency once both have landed.
    //
    // NOTE(review): every call pages through the entire collection at
    // `batchSize` points per request; for a collection in the hundreds of
    // thousands of points that is thousands of round-trips per invocation.
    const chunksBySource = new Map<string, number>()
    let offset: string | number | null | Record<string, unknown> = null
    const batchSize = 100
    do {
      const scrollResult = await this.qdrant!.scroll(RagService.CONTENT_COLLECTION_NAME, {
        limit: batchSize,
        offset,
        // Only the `source` field is needed for the tally; skip vectors.
        with_payload: ['source'],
        with_vector: false,
      })
      for (const point of scrollResult.points) {
        const source = point.payload?.source
        if (source && typeof source === 'string') {
          chunksBySource.set(source, (chunksBySource.get(source) ?? 0) + 1)
        }
      }
      // A missing/null offset marks the last page; `??` keeps any other value.
      offset = scrollResult.next_page_offset ?? null
    } while (offset !== null)

    // Scan the filesystem the same way scanAndSyncStorage does so Warning A
    // can fire on files with zero qdrant points (the headline "video-only
    // ZIM" case).
    const KB_UPLOADS_PATH = join(process.cwd(), RagService.UPLOADS_STORAGE_PATH)
    const ZIM_PATH = join(process.cwd(), ZIM_STORAGE_PATH)
    const allSources = new Set<string>(chunksBySource.keys())
    const sizeByPath = new Map<string, number>()

    for (const dir of [KB_UPLOADS_PATH, ZIM_PATH]) {
      try {
        const entries = await listDirectoryContentsRecursive(dir)
        const filePaths: string[] = []
        for (const entry of entries) {
          if (entry.type !== 'file') continue
          allSources.add(entry.key)
          filePaths.push(entry.key)
        }

        // The stat lookups are independent of each other, so issue them
        // together instead of awaiting one file at a time.
        const sized = await Promise.all(
          filePaths.map(async (path) => [path, await getFileStatsIfExists(path)] as const)
        )
        for (const [path, stat] of sized) {
          if (stat) sizeByPath.set(path, Number(stat.size))
        }
      } catch (error: unknown) {
        // A storage directory that does not exist simply contributes no
        // files; anything else is a real failure for the outer handler.
        const code = (error as { code?: unknown } | null | undefined)?.code
        if (code !== 'ENOENT') throw error
      }
    }

    // Registry lookups are likewise independent per source. `Array.from`
    // preserves the Set's insertion order, so the output key order matches a
    // sequential walk.
    const evaluated = await Promise.all(
      Array.from(allSources, async (source) => {
        // Sources known only from Qdrant (no file on disk) have size 0.
        const fileSizeBytes = sizeByPath.get(source) ?? 0
        const chunksInQdrant = chunksBySource.get(source) ?? 0
        // Last path segment, accepting both `/` and `\` separators.
        const fileName = source.split(/[/\\]/).pop() ?? source
        const expectedChunks =
          fileSizeBytes > 0
            ? await KbRatioRegistry.estimateChunks(fileName, fileSizeBytes)
            : null

        const warnings = decideWarnings({ fileSizeBytes, chunksInQdrant, expectedChunks })
        return [source, warnings] as const
      })
    )

    const out: Record<string, FileWarning[]> = {}
    for (const [source, warnings] of evaluated) {
      if (warnings.length > 0) out[source] = warnings
    }

    return out
  } catch (error) {
    logger.error('[RAG] Error computing file warnings:', error)
    return {}
  }
}
|
||||
|
||||
/**
|
||||
* Delete all Qdrant points associated with a given source path and remove
|
||||
* the corresponding file from disk if it lives under the uploads directory.
|
||||
|
|
|
|||
70
admin/app/utils/kb_warning_decision.ts
Normal file
70
admin/app/utils/kb_warning_decision.ts
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
/**
 * Decision logic for the conditional warnings shown on Stored Files rows in
 * the KB panel (RFC #883 §6). A warning is produced only when its triggering
 * condition holds; healthy files yield an empty list so the panel stays quiet
 * in the common case.
 *
 * - `zero_chunks`   — a large file has no chunks at all. Typical cause: a
 *                     video-only or image-only ZIM that the pipeline finishes
 *                     "successfully" without any extractable text, so the
 *                     assistant has nothing to reference.
 * - `partial_stall` — the file has some chunks, but far fewer than the ratio
 *                     registry predicts. Likely a mid-batch stall that a
 *                     binary "any chunks ⇒ embedded" check would hide.
 */
export type FileWarning =
  | { kind: 'zero_chunks'; fileSizeBytes: number }
  | { kind: 'partial_stall'; chunksEmbedded: number; chunksExpected: number }

/**
 * Size a zero-chunk file must exceed before it is flagged. A tiny upload that
 * yields nothing (e.g. a placeholder file) is far more likely to be a
 * legitimate edge case than the gigabyte-scale video ZIM this warning targets.
 */
export const ZERO_CHUNKS_MIN_SIZE_BYTES = 100 * 1024 * 1024 // 100 MB

/**
 * A file counts as partially stalled when its chunk count is strictly below
 * this fraction of the expected count. 0.5 (50%) is the threshold given for
 * Warning B in RFC #883 §6.
 */
export const PARTIAL_STALL_RATIO_THRESHOLD = 0.5

export interface WarningInputs {
  /** Size of the source file on disk, in bytes. */
  fileSizeBytes: number
  /** Number of chunks currently stored in Qdrant for this source. */
  chunksInQdrant: number
  /**
   * Chunk count the ratio registry predicts for this file, or `null` when no
   * estimate is available. With `null` the partial-stall check is skipped:
   * staying silent is preferred over reporting a comparison that means
   * nothing.
   */
  expectedChunks: number | null
}

/**
 * Decide which warnings apply to a single file.
 *
 * The two warnings cannot co-occur: `zero_chunks` requires exactly zero
 * chunks, while `partial_stall` requires at least one.
 *
 * @param inputs File size, actual chunk count and expected chunk count.
 * @returns A fresh array holding zero or one warning.
 */
export function decideWarnings(inputs: WarningInputs): FileWarning[] {
  const { fileSizeBytes, chunksInQdrant, expectedChunks } = inputs

  // Nothing embedded: only the zero-chunk warning is possible, and only for
  // files strictly larger than the minimum size.
  if (chunksInQdrant === 0) {
    return fileSizeBytes > ZERO_CHUNKS_MIN_SIZE_BYTES
      ? [{ kind: 'zero_chunks', fileSizeBytes }]
      : []
  }

  // Something embedded: flag a stall when a positive expectation exists and
  // the actual count is strictly under the threshold fraction of it.
  if (
    expectedChunks !== null &&
    expectedChunks > 0 &&
    chunksInQdrant > 0 &&
    chunksInQdrant < expectedChunks * PARTIAL_STALL_RATIO_THRESHOLD
  ) {
    return [
      {
        kind: 'partial_stall',
        chunksEmbedded: chunksInQdrant,
        chunksExpected: expectedChunks,
      },
    ]
  }

  return []
}
|
||||
|
|
@ -51,6 +51,16 @@ export default function KnowledgeBaseModal({ aiAssistantName = "AI Assistant", o
|
|||
select: (data) => data || [],
|
||||
})
|
||||
|
||||
// Per-file conditional warnings (RFC #883 §6). Only sources with at least
|
||||
// one triggered warning are returned, so an empty map means everything is
|
||||
// healthy. Polled at the same idle cadence as health for low overhead.
|
||||
const { data: fileWarnings = {} } = useQuery({
|
||||
queryKey: ['kbFileWarnings'],
|
||||
queryFn: () => api.getKbFileWarnings(),
|
||||
select: (data) => data ?? {},
|
||||
refetchInterval: 30_000,
|
||||
})
|
||||
|
||||
// Global auto-index policy. KVStore returns `null` for an unset key, which
|
||||
// we treat as 'Always' for backward compatibility with installs that predate
|
||||
// this UI. The user can opt into Manual mode from the toggle below.
|
||||
|
|
@ -442,8 +452,34 @@ export default function KnowledgeBaseModal({ aiAssistantName = "AI Assistant", o
|
|||
accessor: 'source',
|
||||
title: 'File Name',
|
||||
// File Name cell: the display name derived from the record's source path,
// followed by one amber pill per warning reported for that source.
render(record) {
  // Sources without warnings are absent from the map, hence the default.
  const warnings = fileWarnings[record.source] ?? []
  return (
    <div className="flex flex-col gap-1">
      <span className="text-text-primary">
        {sourceToDisplayName(record.source)}
      </span>
      {warnings.map((w) => (
        <span
          // Each warning kind appears at most once per file, so the kind is
          // a stable key (unlike the array index).
          key={w.kind}
          className="inline-flex items-center gap-1.5 self-start text-xs text-amber-700 dark:text-amber-300 bg-amber-50 dark:bg-amber-950/40 border border-amber-200 dark:border-amber-800 rounded px-2 py-0.5"
        >
          <span aria-hidden="true">⚠</span>
          {w.kind === 'zero_chunks' && (
            <span>
              Embedded 0 chunks — this file has no text content.{' '}
              {/* Use the configured assistant name rather than a fixed label. */}
              {aiAssistantName} cannot reference it.
            </span>
          )}
          {w.kind === 'partial_stall' && (
            <span>
              Only {w.chunksEmbedded.toLocaleString()} of est.{' '}
              {w.chunksExpected.toLocaleString()} chunks embedded —
              ingestion may have stalled.
            </span>
          )}
        </span>
      ))}
    </div>
  )
},
|
||||
},
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import { CheckLatestVersionResult, SystemInformationResponse, SystemUpdateStatus
|
|||
import { DownloadJobWithProgress, WikipediaState } from '../../types/downloads'
|
||||
import type { Country, CountryCode, CountryGroup, MapExtractPreflight } from '../../types/maps'
|
||||
import { EmbedJobWithProgress } from '../../types/rag'
|
||||
import type { FileWarning } from '../../app/utils/kb_warning_decision.js'
|
||||
import type { CategoryWithStatus, CollectionWithStatus, ContentUpdateCheckResult, ResourceUpdateInfo } from '../../types/collections'
|
||||
import { catchInternal } from './util'
|
||||
import { NomadChatResponse, NomadInstalledModel, NomadOllamaModel, OllamaChatRequest } from '../../types/ollama'
|
||||
|
|
@ -475,6 +476,15 @@ class API {
|
|||
})()
|
||||
}
|
||||
|
||||
/**
 * Fetch the per-source warning map from `GET /rag/file-warnings` and unwrap
 * the `warnings` field of the response body. The request runs inside
 * `catchInternal`, like the other methods of this client.
 */
async getKbFileWarnings() {
  type FileWarningsBody = { warnings: Record<string, FileWarning[]> }

  const fetchWarnings = async () => {
    const { data } = await this.client.get<FileWarningsBody>('/rag/file-warnings')
    return data.warnings
  }

  return catchInternal(fetchWarnings)()
}
|
||||
|
||||
async deleteRAGFile(source: string) {
|
||||
return catchInternal(async () => {
|
||||
const response = await this.client.delete<{ message: string }>('/rag/files', { data: { source } })
|
||||
|
|
|
|||
|
|
@ -141,6 +141,7 @@ router
|
|||
.group(() => {
|
||||
router.post('/upload', [RagController, 'upload'])
|
||||
router.get('/files', [RagController, 'getStoredFiles'])
|
||||
router.get('/file-warnings', [RagController, 'getFileWarnings'])
|
||||
router.delete('/files', [RagController, 'deleteFile'])
|
||||
router.get('/active-jobs', [RagController, 'getActiveJobs'])
|
||||
router.get('/failed-jobs', [RagController, 'getFailedJobs'])
|
||||
|
|
|
|||
125
admin/tests/unit/kb_warning_decision.spec.ts
Normal file
125
admin/tests/unit/kb_warning_decision.spec.ts
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
import * as assert from 'node:assert/strict'
|
||||
import { test } from 'node:test'
|
||||
|
||||
import { decideWarnings } from '../../app/utils/kb_warning_decision.js'
|
||||
|
||||
const MB = 1024 * 1024
|
||||
|
||||
/**
 * Scenario table for `decideWarnings`: each row becomes one test that feeds
 * `inputs` to the function and deep-compares the result with `expected`.
 */
const scenarios: ReadonlyArray<{
  title: string
  inputs: Parameters<typeof decideWarnings>[0]
  expected: ReturnType<typeof decideWarnings>
}> = [
  {
    title: 'healthy file: chunks present and on-target → no warnings',
    inputs: { fileSizeBytes: 100 * MB, chunksInQdrant: 11_000, expectedChunks: 11_000 },
    expected: [],
  },
  {
    title: 'healthy file: chunks slightly above expectation → no warnings',
    inputs: { fileSizeBytes: 100 * MB, chunksInQdrant: 12_000, expectedChunks: 11_000 },
    expected: [],
  },
  {
    title: 'Warning A: large file with 0 chunks (video-only ZIM)',
    inputs: { fileSizeBytes: 5 * 1024 * MB, chunksInQdrant: 0, expectedChunks: 0 },
    expected: [{ kind: 'zero_chunks', fileSizeBytes: 5 * 1024 * MB }],
  },
  {
    // A 5 KB placeholder.txt that produces nothing is not worth a banner.
    title: 'Warning A: small empty file is silently ignored (under 100 MB threshold)',
    inputs: { fileSizeBytes: 5 * 1024, chunksInQdrant: 0, expectedChunks: null },
    expected: [],
  },
  {
    title: 'Warning B: partial stall — chunks well below expectation',
    inputs: { fileSizeBytes: 1000 * MB, chunksInQdrant: 266, expectedChunks: 600_000 },
    expected: [{ kind: 'partial_stall', chunksEmbedded: 266, chunksExpected: 600_000 }],
  },
  {
    title: 'Warning B: chunks just under 50% of expected → triggers',
    inputs: { fileSizeBytes: 100 * MB, chunksInQdrant: 4_999, expectedChunks: 10_000 },
    expected: [{ kind: 'partial_stall', chunksEmbedded: 4_999, chunksExpected: 10_000 }],
  },
  {
    // The comparison is strict less-than, so the exact boundary stays quiet.
    title: 'Warning B: chunks at exactly 50% of expected → does NOT trigger',
    inputs: { fileSizeBytes: 100 * MB, chunksInQdrant: 5_000, expectedChunks: 10_000 },
    expected: [],
  },
  {
    // Silence beats a meaningless "266 of unknown" comparison.
    title: 'Warning B suppressed when expectedChunks is null (registry miss)',
    inputs: { fileSizeBytes: 100 * MB, chunksInQdrant: 266, expectedChunks: null },
    expected: [],
  },
  {
    // A registry row that says "expect 0 chunks", matched by a file that
    // indeed produced 0 chunks, must not trigger Warning B. Warning A still
    // fires here because the file is over 100 MB and has no chunks.
    title: 'Warning B suppressed when expectedChunks is 0 (video-only registry entry)',
    inputs: { fileSizeBytes: 500 * MB, chunksInQdrant: 0, expectedChunks: 0 },
    expected: [{ kind: 'zero_chunks', fileSizeBytes: 500 * MB }],
  },
]

for (const { title, inputs, expected } of scenarios) {
  test(title, () => {
    assert.deepEqual(decideWarnings(inputs), expected)
  })
}
|
||||
|
||||
test('Warnings A and B are mutually exclusive: 0 chunks with a non-zero expectation → only Warning A', () => {
  // Edge case: huge file, 0 chunks, but the ratio registry expected 100k.
  // Warning A fires (large + zero chunks). Warning B stays silent because it
  // requires chunksInQdrant > 0, so the two warnings never appear together.
  assert.deepEqual(
    decideWarnings({
      fileSizeBytes: 1000 * MB,
      chunksInQdrant: 0,
      expectedChunks: 100_000,
    }),
    [{ kind: 'zero_chunks', fileSizeBytes: 1000 * MB }]
  )
})
|
||||
Loading…
Reference in New Issue
Block a user