project-nomad/admin/app/services/zim_extraction_service.ts
chriscrosstalk 216509ae0d fix(rag): repair ZIM embedding pipeline (sync filter, batch gate, DOM walk) (#745)
Three bugs in the RAG embedding pipeline, diagnosed and patched by @sbruschke
against v1.31.0 with working before/after chunk counts. All three are
root-cause contributors to #388.

1. scanAndSyncStorage queued every file under /storage/zim/ for embedding,
   including Kiwix's generated kiwix-library.xml. EmbedFileJob rejected it
   with "Unsupported file type" and the default 30-attempt retry policy
   kept it looping on every sync, flooding nomad_admin logs. Now gated on
   determineFileType(filePath) !== 'unknown'.

2. hasMoreBatches compared zimChunks.length (section-level chunk count
   under the 'structured' strategy) against ZIM_BATCH_SIZE (an article
   limit). Because articles emit multiple sections, the two are never
   equal for real archives and processing silently stopped after the
   first 50 articles. Now gated on articlesInBatch >= ZIM_BATCH_SIZE.

3. extractStructuredContent walked only direct children of <body>, so any
   ZIM that wraps content in a container div (Devdocs, Wikipedia,
   FreeCodeCamp, React docs, etc.) produced zero sections and silently
   embedded zero chunks while reporting success. Now walks the full DOM
   via $('body').find('h2, h3, h4, p, ul, ol, dl, table'), with a
   whole-body text fallback when the selector walk yields nothing.

Before/after chunk counts confirmed by @sbruschke on v1.31.0:
  devdocs_en_git   0 -> 916
  devdocs_en_react 0 -> 481
  devdocs_en_node  0 -> 423
  libretexts_en_eng 1 -> 35 (climbing)
Wikipedia resumed progressing normally through its 6M articles.

Closes #718
Closes #719
Closes #720
Closes #388

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-21 14:26:28 -07:00

334 lines
14 KiB
TypeScript

import { Archive, Entry } from '@openzim/libzim'
import * as cheerio from 'cheerio'
import { HTML_SELECTORS_TO_REMOVE, NON_CONTENT_HEADING_PATTERNS } from '../../constants/zim_extraction.js'
import logger from '@adonisjs/core/services/logger'
import { ExtractZIMChunkingStrategy, ExtractZIMContentOptions, ZIMContentChunk, ZIMArchiveMetadata } from '../../types/zim.js'
import { randomUUID } from 'node:crypto'
import { access } from 'node:fs/promises'
import { isValidZimFile } from '../utils/fs.js'
/**
 * Extracts article content from ZIM archives and turns it into chunks
 * (one per article, or one per heading-delimited section) annotated with
 * article and archive metadata.
 */
export class ZIMExtractionService {
  /**
   * Reads a single metadata value from the archive without letting a missing
   * key abort the caller.
   *
   * The native binding is expected to throw for keys the archive does not
   * define, so every key is guarded individually.
   *
   * @param archive - Open ZIM archive
   * @param key - Metadata key to read (e.g. 'Title')
   * @returns The metadata value, or an empty string when it is absent or unreadable
   */
  private readMetadataValue(archive: Archive, key: string): string {
    try {
      return archive.getMetadata(key) || ''
    } catch {
      logger.debug(`[ZIMExtractionService]: Metadata key "${key}" is not available, using default`)
      return ''
    }
  }

  /**
   * Collects archive-level metadata, substituting a default for each key that
   * is missing. A missing key only affects its own field; the remaining
   * fields keep the values present in the archive.
   *
   * @param archive - Open ZIM archive
   * @returns Metadata with 'Unknown' (or '' for description) for absent keys
   */
  private extractArchiveMetadata(archive: Archive): ZIMArchiveMetadata {
    const read = (key: string): string => this.readMetadataValue(archive, key)

    return {
      title: read('Title') || read('Name') || 'Unknown',
      creator: read('Creator') || 'Unknown',
      publisher: read('Publisher') || 'Unknown',
      date: read('Date') || 'Unknown',
      language: read('Language') || 'Unknown',
      description: read('Description'),
    }
  }

  /**
   * Breaks out a ZIM file's entries into their structured content form
   * to facilitate better indexing and retrieval.
   * Returns enhanced chunks with full article context and metadata.
   *
   * @param filePath - Path to the ZIM file
   * @param opts - Options including maxArticles, strategy, onProgress, startOffset, and batchSize
   * @returns Non-empty chunks for the articles in the requested window
   * @throws Error when the file is not accessible or fails ZIM validation;
   *         any other failure is logged and rethrown unchanged
   */
  async extractZIMContent(filePath: string, opts: ExtractZIMContentOptions = {}): Promise<ZIMContentChunk[]> {
    try {
      logger.info(`[ZIMExtractionService]: Processing ZIM file at path: ${filePath}`)

      // defensive - check if file still exists before opening
      // could have been deleted by another process or batch
      try {
        await access(filePath)
      } catch {
        logger.error(`[ZIMExtractionService]: ZIM file not accessible: ${filePath}`)
        throw new Error(`ZIM file not found or not accessible: ${filePath}`)
      }

      // Validate ZIM magic number before opening with native library.
      // A corrupted file causes a native C++ abort that cannot be caught by JS.
      if (!(await isValidZimFile(filePath))) {
        throw new Error(`ZIM file is invalid or corrupted: ${filePath}`)
      }

      const archive = new Archive(filePath)

      // Extract archive-level metadata once
      const archiveMetadata = this.extractArchiveMetadata(archive)
      logger.info(`[ZIMExtractionService]: Archive metadata - Title: ${archiveMetadata.title}, Language: ${archiveMetadata.language}`)

      let articlesProcessed = 0
      let articlesSkipped = 0
      const processedPaths = new Set<string>()
      const toReturn: ZIMContentChunk[] = []

      // Support batch processing to avoid lock timeouts on large ZIM files
      const startOffset = opts.startOffset || 0
      const batchSize = opts.batchSize || (opts.maxArticles || Infinity)

      for (const entry of archive.iterByPath()) {
        // Skip articles until we reach the start offset. Only article entries
        // count toward the offset; nothing has been processed yet in this
        // phase, so no duplicate check is needed here.
        if (articlesSkipped < startOffset) {
          if (this.isArticleEntry(entry)) {
            articlesSkipped++
          }
          continue
        }

        if (articlesProcessed >= batchSize) {
          break
        }

        if (!this.isArticleEntry(entry)) {
          logger.debug(`[ZIMExtractionService]: Skipping non-article entry at path: ${entry.path}`)
          continue
        }

        if (processedPaths.has(entry.path)) {
          logger.debug(`[ZIMExtractionService]: Skipping duplicate entry at path: ${entry.path}`)
          continue
        }
        processedPaths.add(entry.path)

        const item = entry.item
        const blob = item.data
        const html = this.getCleanedHTMLString(blob.data)

        const strategy = opts.strategy || this.chooseChunkingStrategy(html)
        logger.debug(`[ZIMExtractionService]: Chosen chunking strategy for path ${entry.path}: ${strategy}`)

        // Generate a unique document ID. All chunks from same article will share it
        const documentId = randomUUID()
        const articleTitle = entry.title || entry.path

        let chunks: ZIMContentChunk[]
        if (strategy === 'structured') {
          const structured = this.extractStructuredContent(html)
          chunks = structured.sections.map(s => ({
            text: s.text,
            articleTitle,
            articlePath: entry.path,
            sectionTitle: s.heading,
            fullTitle: `${articleTitle} - ${s.heading}`,
            hierarchy: `${articleTitle} > ${s.heading}`,
            sectionLevel: s.level,
            documentId,
            archiveMetadata,
            strategy,
          }))
        } else {
          // Simple strategy - entire article as one chunk
          const text = this.extractTextFromHTML(html) || ''
          chunks = [{
            text,
            articleTitle,
            articlePath: entry.path,
            sectionTitle: articleTitle, // Same as article for simple strategy
            fullTitle: articleTitle,
            hierarchy: articleTitle,
            documentId,
            archiveMetadata,
            strategy,
          }]
        }
        logger.debug(`Extracted ${chunks.length} chunks from article at path: ${entry.path} using strategy: ${strategy}`)

        const nonEmptyChunks = chunks.filter(c => c.text.trim().length > 0)
        logger.debug(`After filtering empty chunks, ${nonEmptyChunks.length} chunks remain for article at path: ${entry.path}`)
        toReturn.push(...nonEmptyChunks)

        // An article counts as processed even when all of its chunks were empty.
        articlesProcessed++
        if (opts.onProgress) {
          opts.onProgress(articlesProcessed, archive.articleCount)
        }
      }

      logger.info(`[ZIMExtractionService]: Completed processing ZIM file. Total articles processed: ${articlesProcessed}`)

      // Structured values go in the leading merging object; a trailing argument
      // after a message without format placeholders would not be logged.
      logger.debug(
        {
          sample: toReturn.slice(0, 3).map(c => ({
            articleTitle: c.articleTitle,
            sectionTitle: c.sectionTitle,
            hierarchy: c.hierarchy,
            textPreview: c.text.substring(0, 100),
          })),
        },
        'Final structured content sample'
      )
      logger.debug(`Total structured sections extracted: ${toReturn.length}`)

      return toReturn
    } catch (error) {
      logger.error({ err: error }, `[ZIMExtractionService]: Error processing ZIM file: ${filePath}`)
      throw error
    }
  }

  /**
   * Picks the chunking strategy for one article.
   *
   * @param html - Cleaned article HTML
   * @param options - `forceStrategy` overrides the heuristic when set
   * @returns 'structured' when the heading heuristic passes, otherwise 'simple'
   */
  private chooseChunkingStrategy(html: string, options = {
    forceStrategy: null as ExtractZIMChunkingStrategy | null,
  }): ExtractZIMChunkingStrategy {
    const {
      forceStrategy = null,
    } = options
    if (forceStrategy) return forceStrategy

    // Use a simple analysis to determine if the HTML has any meaningful structure
    // that we can leverage for better chunking. If not, we'll just chunk it as one big piece of text.
    return this.hasStructuredHeadings(html) ? 'structured' : 'simple'
  }

  /**
   * Decodes an article buffer as UTF-8 and strips every element matching
   * HTML_SELECTORS_TO_REMOVE.
   *
   * @param buff - Raw article bytes
   * @returns The serialized HTML after removal
   */
  private getCleanedHTMLString(buff: Buffer<ArrayBufferLike>): string {
    const rawString = buff.toString('utf-8')
    const $ = cheerio.load(rawString)

    HTML_SELECTORS_TO_REMOVE.forEach((selector) => {
      $(selector).remove()
    })

    return $.html()
  }

  /**
   * Flattens HTML to plain text with all whitespace runs collapsed to a
   * single space.
   *
   * @param html - Article HTML
   * @returns The flattened text, or null when parsing throws
   */
  private extractTextFromHTML(html: string): string | null {
    try {
      const $ = cheerio.load(html)

      // Search body first, then root if body is absent
      const $body = $('body')
      const text = $body.length ? $body.text() : $.root().text()

      // Collapsing \s+ also removes every newline, so no further newline
      // normalization is needed afterwards.
      return text.replace(/\s+/g, ' ').trim()
    } catch (error) {
      logger.error({ err: error }, 'Error extracting text from HTML')
      return null
    }
  }

  /**
   * Splits an article into sections delimited by h2/h3/h4 headings.
   *
   * Content before the first heading is collected under 'Introduction'.
   * When no section is produced but the body has text, a single section with
   * the whole body text is returned instead.
   *
   * @param html - Cleaned article HTML
   * @returns The article title, its sections, and the sections joined as text
   */
  private extractStructuredContent(html: string) {
    const $ = cheerio.load(html)

    const title = $('h1').first().text().trim() || $('title').text().trim()

    const headingTags = ['h2', 'h3', 'h4']
    const contentSelector = 'p, ul, ol, dl, table'

    // Extract sections with their headings and heading levels
    const sections: Array<{ heading: string; text: string; level: number }> = []
    let currentSection = { heading: 'Introduction', content: [] as string[], level: 2 }

    // Saves the section being accumulated, unless it has no content.
    const flushCurrentSection = (): void => {
      if (currentSection.content.length === 0) return
      sections.push({
        heading: currentSection.heading,
        text: currentSection.content.join(' ').replace(/\s+/g, ' ').trim(),
        level: currentSection.level,
      })
    }

    // Walk the full DOM rather than only direct children of <body>. Modern ZIMs (Devdocs,
    // Wikipedia, FreeCodeCamp, etc.) wrap article content in a container div, which under
    // .children() would be a single non-heading/non-paragraph element and yield zero sections.
    $('body').find(`h2, h3, h4, ${contentSelector}`).each((_, element) => {
      const $el = $(element)
      const tagName = element.tagName?.toLowerCase() ?? ''

      if (headingTags.includes(tagName)) {
        flushCurrentSection()
        // Start new section; the level is the digit in h2/h3/h4.
        currentSection = {
          heading: $el.text().replace(/\[edit\]/gi, '').trim(),
          content: [],
          level: Number.parseInt(tagName.substring(1), 10),
        }
        return
      }

      // A content element nested inside another matched content element
      // (<p> in <li>, a nested list, a list inside a table cell, ...) already
      // had its text captured via that ancestor's .text(). Skipping it keeps
      // the same text from being pushed once per nesting level.
      // NOTE(review): when a layout table wraps headings, the table is
      // captured whole and the paragraphs under those headings are skipped.
      if ($el.parents(contentSelector).length > 0) return

      const text = $el.text().trim()
      if (text.length > 0) {
        currentSection.content.push(text)
      }
    })

    // Push the last section if it has content
    flushCurrentSection()

    // Fallback: if the selector walk produced no sections but the body has meaningful
    // text (unusual structure, minimal markup), emit one section with the full body text
    // so the article still contributes to the knowledge base.
    if (sections.length === 0) {
      const bodyText = $('body').text().replace(/\s+/g, ' ').trim()
      if (bodyText.length > 0) {
        sections.push({
          heading: title || 'Content',
          text: bodyText,
          level: 2,
        })
      }
    }

    return {
      title,
      sections,
      fullText: sections.map(s => `${s.heading}\n${s.text}`).join('\n\n'),
    }
  }

  /**
   * Heuristic deciding whether an article has enough heading structure to be
   * chunked by section: at least two h2/h3 headings, each followed by at
   * least 100 characters of sibling content.
   *
   * NOTE(review): only the heading's following siblings are measured, so a
   * heading wrapped in its own container element measures zero content and
   * the article falls back to 'simple' — confirm this is intended.
   *
   * @param html - Cleaned article HTML
   * @returns true when at least two headings have substantial content
   */
  private hasStructuredHeadings(html: string): boolean {
    const $ = cheerio.load(html)

    const headings = $('h2, h3').toArray()

    // Consider it structured if it has at least 2 headings to break content into meaningful sections
    if (headings.length < 2) return false

    // Check that headings have substantial content between them
    let sectionsWithContent = 0

    for (const heading of headings) {
      const $heading = $(heading)
      const headingText = $heading.text().trim()

      // Skip empty or very short headings, likely not meaningful
      if (headingText.length < 3) continue

      // Skip common non-content headings
      if (NON_CONTENT_HEADING_PATTERNS.some(pattern => pattern.test(headingText))) {
        continue
      }

      // Content until next heading
      let contentLength = 0
      let $next = $heading.next()
      while ($next.length && !$next.is('h1, h2, h3, h4')) {
        contentLength += $next.text().trim().length
        $next = $next.next()
      }

      // Consider it a real section if it has at least 100 chars of content
      if (contentLength >= 100) {
        sectionsWithContent++
      }
    }

    // Require at least 2 sections with substantial content
    return sectionsWithContent >= 2
  }

  /**
   * Tells whether an entry is an HTML/XHTML article.
   *
   * @param entry - Archive entry to inspect
   * @returns false for redirects, for other MIME types, and when reading the
   *          entry's item throws
   */
  private isArticleEntry(entry: Entry): boolean {
    try {
      if (entry.isRedirect) return false

      const item = entry.item
      const mimeType = item.mimetype

      return mimeType === 'text/html' || mimeType === 'application/xhtml+xml'
    } catch {
      return false
    }
  }
}