mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-05-12 16:10:11 +02:00
fix(rag): repair ZIM embedding pipeline (sync filter, batch gate, DOM walk) (#745)
Three bugs in the RAG embedding pipeline, diagnosed and patched by @sbruschke against v1.31.0 with working before/after chunk counts. All three are root-cause contributors to #388.

1. scanAndSyncStorage queued every file under /storage/zim/ for embedding, including Kiwix's generated kiwix-library.xml. EmbedFileJob rejected it with "Unsupported file type" and the default 30-attempt retry policy kept it looping on every sync, flooding nomad_admin logs. Now gated on determineFileType(filePath) !== 'unknown'.

2. hasMoreBatches compared zimChunks.length (section-level chunk count under the 'structured' strategy) against ZIM_BATCH_SIZE (an article limit). Because articles emit multiple sections, the two are never equal for real archives and processing silently stopped after the first 50 articles. Now gated on articlesInBatch >= ZIM_BATCH_SIZE.

3. extractStructuredContent walked only direct children of <body>, so any ZIM that wraps content in a container div (Devdocs, Wikipedia, FreeCodeCamp, React docs, etc.) produced zero sections and silently embedded zero chunks while reporting success. Now walks the full DOM via $('body').find('h2, h3, h4, p, ul, ol, dl, table'), with a whole-body text fallback when the selector walk yields nothing.

Before/after chunk counts confirmed by @sbruschke on v1.31.0:

  devdocs_en_git      0 -> 916
  devdocs_en_react    0 -> 481
  devdocs_en_node     0 -> 423
  libretexts_en_eng   1 -> 35 (climbing)

Wikipedia resumed progressing normally through its 6M articles.

Closes #718
Closes #719
Closes #720
Closes #388

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d22c0b202c
commit
9c98d8225b
|
|
@ -532,9 +532,12 @@ export class RagService {
|
|||
}
|
||||
}
|
||||
|
||||
// Count unique articles processed in this batch
|
||||
// Count unique articles processed in this batch. hasMoreBatches gates on the article
|
||||
// count — zimChunks.length counts section-level chunks (multiple per article under the
|
||||
// 'structured' strategy), so comparing it to ZIM_BATCH_SIZE (an article limit) caps
|
||||
// processing at the first batch for any real archive.
|
||||
const articlesInBatch = new Set(zimChunks.map((c) => c.documentId)).size
|
||||
const hasMoreBatches = zimChunks.length === ZIM_BATCH_SIZE
|
||||
const hasMoreBatches = articlesInBatch >= ZIM_BATCH_SIZE
|
||||
|
||||
logger.info(
|
||||
`[RAG] Successfully embedded ${totalChunks} total chunks from ${articlesInBatch} articles (hasMore: ${hasMoreBatches})`
|
||||
|
|
@ -1252,8 +1255,12 @@ export class RagService {
|
|||
|
||||
logger.info(`[RAG] Found ${sourcesInQdrant.size} unique sources in Qdrant`)
|
||||
|
||||
// Find files that are in storage but not in Qdrant
|
||||
const filesToEmbed = filesInStorage.filter((filePath) => !sourcesInQdrant.has(filePath))
|
||||
// Find files that are in storage, not already in Qdrant, and have an embeddable type.
|
||||
// Non-embeddable files (e.g. kiwix-library.xml in /storage/zim) would otherwise be
|
||||
// dispatched to EmbedFileJob, fail with "Unsupported file type", and retry on every sync.
|
||||
const filesToEmbed = filesInStorage.filter(
|
||||
(filePath) => !sourcesInQdrant.has(filePath) && determineFileType(filePath) !== 'unknown'
|
||||
)
|
||||
|
||||
logger.info(`[RAG] Found ${filesToEmbed.length} files that need embedding`)
|
||||
|
||||
|
|
|
|||
|
|
@ -216,7 +216,10 @@ export class ZIMExtractionService {
|
|||
const sections: Array<{ heading: string; text: string; level: number }> = [];
|
||||
let currentSection = { heading: 'Introduction', content: [] as string[], level: 2 };
|
||||
|
||||
$('body').children().each((_, element) => {
|
||||
// Walk the full DOM rather than only direct children of <body>. Modern ZIMs (Devdocs,
|
||||
// Wikipedia, FreeCodeCamp, etc.) wrap article content in a container div, which under
|
||||
// .children() would be a single non-heading/non-paragraph element and yield zero sections.
|
||||
$('body').find('h2, h3, h4, p, ul, ol, dl, table').each((_, element) => {
|
||||
const $el = $(element);
|
||||
const tagName = element.tagName?.toLowerCase();
|
||||
|
||||
|
|
@ -253,6 +256,20 @@ export class ZIMExtractionService {
|
|||
});
|
||||
}
|
||||
|
||||
// Fallback: if the selector walk produced no sections but the body has meaningful
|
||||
// text (unusual structure, minimal markup), emit one section with the full body text
|
||||
// so the article still contributes to the knowledge base.
|
||||
if (sections.length === 0) {
|
||||
const bodyText = $('body').text().replace(/\s+/g, ' ').trim();
|
||||
if (bodyText.length > 0) {
|
||||
sections.push({
|
||||
heading: title || 'Content',
|
||||
text: bodyText,
|
||||
level: 2,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
title,
|
||||
sections,
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user