feat: zim content embedding

This commit is contained in:
Jake Turner 2026-02-08 12:40:50 -08:00 committed by Jake Turner
parent c2b6e079af
commit 8726700a0a
12 changed files with 2289 additions and 204 deletions

View File

@ -5,7 +5,6 @@ import type { HttpContext } from '@adonisjs/core/http'
import app from '@adonisjs/core/services/app'
import { randomBytes } from 'node:crypto'
import { sanitizeFilename } from '../utils/fs.js'
import { stat } from 'node:fs/promises'
import { getJobStatusSchema } from '#validators/rag'
@inject()
@ -28,20 +27,10 @@ export default class RagController {
name: fileName,
})
// Get file size for tracking
let fileSize: number | undefined = undefined
try {
const stats = await stat(fullPath)
fileSize = stats.size
} catch (error) {
// Not critical if we can't get file size, just swallow the error
}
// Dispatch background job for embedding
const result = await EmbedFileJob.dispatch({
filePath: fullPath,
fileName,
fileSize,
})
return response.status(202).json({

View File

@ -10,6 +10,10 @@ export interface EmbedFileJobParams {
filePath: string
fileName: string
fileSize?: number
// Batch processing for large ZIM files
batchOffset?: number // Current batch offset (for ZIM files)
totalArticles?: number // Total articles in ZIM (for progress tracking)
isFinalBatch?: boolean // Whether this is the last batch (prevents premature deletion)
}
export class EmbedFileJob {
@ -26,9 +30,11 @@ export class EmbedFileJob {
}
async handle(job: Job) {
const { filePath, fileName } = job.data as EmbedFileJobParams
const { filePath, fileName, batchOffset, totalArticles } = job.data as EmbedFileJobParams
logger.info(`[EmbedFileJob] Starting embedding process for: ${fileName}`)
const isZimBatch = batchOffset !== undefined
const batchInfo = isZimBatch ? ` (batch offset: ${batchOffset})` : ''
logger.info(`[EmbedFileJob] Starting embedding process for: ${fileName}${batchInfo}`)
const dockerService = new DockerService()
const ollamaService = new OllamaService()
@ -55,30 +61,78 @@ export class EmbedFileJob {
await job.updateData({
...job.data,
status: 'processing',
startedAt: Date.now(),
startedAt: job.data.startedAt || Date.now(),
})
logger.info(`[EmbedFileJob] Processing file: ${filePath}`)
// Process and embed the file
const result = await ragService.processAndEmbedFile(filePath)
// Only allow deletion if explicitly marked as final batch
const allowDeletion = job.data.isFinalBatch === true
const result = await ragService.processAndEmbedFile(
filePath,
allowDeletion,
batchOffset
)
if (!result.success) {
logger.error(`[EmbedFileJob] Failed to process file ${fileName}: ${result.message}`)
throw new Error(result.message)
}
// Update progress complete
// For ZIM files with batching, check if more batches are needed
if (result.hasMoreBatches) {
const nextOffset = (batchOffset || 0) + (result.articlesProcessed || 0)
logger.info(
`[EmbedFileJob] Batch complete. Dispatching next batch at offset ${nextOffset}`
)
// Dispatch next batch (not final yet)
await EmbedFileJob.dispatch({
filePath,
fileName,
batchOffset: nextOffset,
totalArticles: totalArticles || result.totalArticles,
isFinalBatch: false, // Explicitly not final
})
// Calculate progress based on articles processed
const progress = totalArticles
? Math.round((nextOffset / totalArticles) * 100)
: 50
await job.updateProgress(progress)
await job.updateData({
...job.data,
status: 'batch_completed',
lastBatchAt: Date.now(),
chunks: (job.data.chunks || 0) + (result.chunks || 0),
})
return {
success: true,
fileName,
filePath,
chunks: result.chunks,
hasMoreBatches: true,
nextOffset,
message: `Batch embedded ${result.chunks} chunks, next batch queued`,
}
}
// Final batch or non-batched file - mark as complete
const totalChunks = (job.data.chunks || 0) + (result.chunks || 0)
await job.updateProgress(100)
await job.updateData({
...job.data,
status: 'completed',
completedAt: Date.now(),
chunks: result.chunks,
chunks: totalChunks,
})
const batchMsg = isZimBatch ? ` (final batch, total chunks: ${totalChunks})` : ''
logger.info(
`[EmbedFileJob] Successfully embedded ${result.chunks} chunks from file: ${fileName}`
`[EmbedFileJob] Successfully embedded ${result.chunks} chunks from file: ${fileName}${batchMsg}`
)
return {

View File

@ -6,6 +6,7 @@ import { createHash } from 'crypto'
import { DockerService } from '#services/docker_service'
import { ZimService } from '#services/zim_service'
import { MapService } from '#services/map_service'
import { EmbedFileJob } from './embed_file_job.js'
export class RunDownloadJob {
static get queue() {
@ -24,17 +25,6 @@ export class RunDownloadJob {
const { url, filepath, timeout, allowedMimeTypes, forceNew, filetype } =
job.data as RunDownloadJobParams
// console.log("Simulating delay for job for URL:", url)
// await new Promise((resolve) => setTimeout(resolve, 30000)) // Simulate initial delay
// console.log("Starting download for URL:", url)
// // simulate progress updates for demonstration
// for (let progress = 0; progress <= 100; progress += 10) {
// await new Promise((resolve) => setTimeout(resolve, 20000)) // Simulate time taken for each progress step
// job.updateProgress(progress)
// console.log(`Job progress for URL ${url}: ${progress}%`)
// }
await doResumableDownload({
url,
filepath,
@ -51,6 +41,16 @@ export class RunDownloadJob {
const dockerService = new DockerService()
const zimService = new ZimService(dockerService)
await zimService.downloadRemoteSuccessCallback([url], true)
// Dispatch an embedding job for the downloaded ZIM file
try {
await EmbedFileJob.dispatch({
fileName: url.split('/').pop() || '',
filePath: filepath,
})
} catch (error) {
console.error(`[RunDownloadJob] Error dispatching EmbedFileJob for URL ${url}:`, error)
}
} else if (filetype === 'map') {
const mapsService = new MapService()
await mapsService.downloadRemoteSuccessCallback([url], false)

View File

@ -3,6 +3,7 @@ import { streamToString } from '../../util/docs.js'
import { getFile, getFileStatsIfExists, listDirectoryContentsRecursive } from '../utils/fs.js'
import path from 'path'
import InternalServerErrorException from '#exceptions/internal_server_error_exception'
import logger from '@adonisjs/core/services/logger'
export class DocsService {
private docsPath = path.join(process.cwd(), 'docs')
@ -46,13 +47,13 @@ export class DocsService {
// Filter out attribute-undefined errors which may be caused by emojis and special characters
const criticalErrors = errors.filter((e) => e.error.id !== 'attribute-undefined')
if (criticalErrors.length > 0) {
console.error('Markdoc validation errors:', errors.map((e) => JSON.stringify(e.error)).join(', '))
logger.error('Markdoc validation errors:', errors.map((e) => JSON.stringify(e.error)).join(', '))
throw new Error('Markdoc validation failed')
}
return Markdoc.transform(ast, config)
} catch (error) {
console.log('Error parsing Markdoc content:', error)
logger.error('Error parsing Markdoc content:', error)
throw new InternalServerErrorException(`Error parsing content: ${(error as Error).message}`)
}
}

View File

@ -15,6 +15,8 @@ import { randomUUID } from 'node:crypto'
import { join } from 'node:path'
import KVStore from '#models/kv_store'
import { parseBoolean } from '../utils/misc.js'
import { ZIMExtractionService } from './zim_extraction_service.js'
import { ZIM_BATCH_SIZE } from '../../constants/zim_extraction.js'
@inject()
export class RagService {
@ -38,6 +40,67 @@ export class RagService {
private ollamaService: OllamaService
) { }
/**
 * Lazily creates the Qdrant client, memoizing the in-flight promise so
 * concurrent callers share a single initialization.
 *
 * FIX: on failure the cached promise is now cleared so a later call can
 * retry. Previously a rejected promise stayed cached forever, making every
 * subsequent call fail even after Qdrant became reachable.
 */
private async _initializeQdrantClient() {
  if (!this.qdrantInitPromise) {
    this.qdrantInitPromise = (async () => {
      const qdrantUrl = await this.dockerService.getServiceURL(SERVICE_NAMES.QDRANT)
      if (!qdrantUrl) {
        throw new Error('Qdrant service is not installed or running.')
      }
      this.qdrant = new QdrantClient({ url: qdrantUrl })
    })().catch((error: unknown) => {
      // Drop the failed attempt so the next caller triggers a fresh init
      this.qdrantInitPromise = undefined
      throw error
    })
  }
  return this.qdrantInitPromise
}
/** Guarantees the Qdrant client exists before any collection/point operation. */
private async _ensureDependencies() {
  if (this.qdrant) return
  await this._initializeQdrantClient()
}
/**
 * Creates the named Qdrant collection (cosine distance) if it does not
 * already exist. Errors are logged with context and rethrown to the caller.
 *
 * @param collectionName - Target Qdrant collection
 * @param dimensions - Vector size, defaults to the service's embedding dimension
 */
private async _ensureCollection(
  collectionName: string,
  dimensions: number = RagService.EMBEDDING_DIMENSION
) {
  try {
    await this._ensureDependencies()
    const existing = await this.qdrant!.getCollections()
    const alreadyPresent = existing.collections.some(
      (collection) => collection.name === collectionName
    )
    if (alreadyPresent) return
    await this.qdrant!.createCollection(collectionName, {
      vectors: { size: dimensions, distance: 'Cosine' },
    })
  } catch (error) {
    logger.error('Error ensuring Qdrant collection:', error)
    throw error
  }
}
/**
* Sanitizes text to ensure it's safe for JSON encoding and Qdrant storage.
* Removes problematic characters that can cause "unexpected end of hex escape" errors:
* - Null bytes (\x00)
* - Invalid Unicode sequences
* - Control characters (except newlines, tabs, and carriage returns)
*/
/**
 * Sanitizes text so it is safe for JSON encoding and Qdrant storage.
 * Removes characters that cause "unexpected end of hex escape" errors:
 *  - Null bytes (\x00)
 *  - Problematic control characters (keeps \n, \r, \t)
 *  - LONE (unpaired) UTF-16 surrogates, which are invalid in JSON
 *
 * FIX: the previous pattern /[\uD800-\uDFFF]/g matched BOTH halves of
 * valid surrogate pairs, so every emoji and astral-plane character was
 * destroyed. Valid pairs are now preserved; only unpaired surrogates
 * (the actual JSON-encoding hazard) are dropped.
 */
private sanitizeText(text: string): string {
  return text
    // Null bytes
    .replace(/\x00/g, '')
    // Problematic control characters (keep \n, \r, \t)
    .replace(/[\x01-\x08\x0B-\x0C\x0E-\x1F\x7F]/g, '')
    // Lone high surrogate: not followed by a low surrogate
    .replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])/g, '')
    // Lone low surrogate: not preceded by a high surrogate
    .replace(/(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g, '')
    // Trim extra whitespace
    .trim()
}
/**
* Estimates token count for text. This is a conservative approximation:
* - English text: ~1 token per 3 characters
@ -114,48 +177,6 @@ export class RagService {
return [...new Set(keywords)]
}
private async _initializeQdrantClient() {
if (!this.qdrantInitPromise) {
this.qdrantInitPromise = (async () => {
const qdrantUrl = await this.dockerService.getServiceURL(SERVICE_NAMES.QDRANT)
if (!qdrantUrl) {
throw new Error('Qdrant service is not installed or running.')
}
this.qdrant = new QdrantClient({ url: qdrantUrl })
})()
}
return this.qdrantInitPromise
}
private async _ensureDependencies() {
if (!this.qdrant) {
await this._initializeQdrantClient()
}
}
private async _ensureCollection(
collectionName: string,
dimensions: number = RagService.EMBEDDING_DIMENSION
) {
try {
await this._ensureDependencies()
const collections = await this.qdrant!.getCollections()
const collectionExists = collections.collections.some((col) => col.name === collectionName)
if (!collectionExists) {
await this.qdrant!.createCollection(collectionName, {
vectors: {
size: dimensions,
distance: 'Cosine',
},
})
}
} catch (error) {
logger.error('Error ensuring Qdrant collection:', error)
throw error
}
}
public async embedAndStoreText(
text: string,
metadata: Record<string, any> = {}
@ -237,21 +258,45 @@ export class RagService {
const timestamp = Date.now()
const points = chunks.map((chunkText, index) => {
// Extract keywords for hybrid search
const keywords = this.extractKeywords(chunkText)
logger.debug(`[RAG] Extracted keywords for chunk ${index}: [${keywords.join(', ')}]`)
// Sanitize text to prevent JSON encoding errors
const sanitizedText = this.sanitizeText(chunkText)
// Extract keywords from content
const contentKeywords = this.extractKeywords(sanitizedText)
// For ZIM content, also extract keywords from structural metadata
let structuralKeywords: string[] = []
if (metadata.full_title) {
structuralKeywords = this.extractKeywords(metadata.full_title as string)
} else if (metadata.article_title) {
structuralKeywords = this.extractKeywords(metadata.article_title as string)
}
// Combine and dedup keywords
const allKeywords = [...new Set([...structuralKeywords, ...contentKeywords])]
logger.debug(`[RAG] Extracted keywords for chunk ${index}: [${allKeywords.join(', ')}]`)
if (structuralKeywords.length > 0) {
logger.debug(`[RAG] - Structural: [${structuralKeywords.join(', ')}], Content: [${contentKeywords.join(', ')}]`)
}
// Sanitize source metadata as well
const sanitizedSource = typeof metadata.source === 'string'
? this.sanitizeText(metadata.source)
: 'unknown'
return {
id: randomUUID(), // qdrant requires either uuid or unsigned int
vector: embeddings[index],
payload: {
...metadata,
text: chunkText,
text: sanitizedText,
chunk_index: index,
total_chunks: chunks.length,
keywords: keywords.join(' '), // Store as space-separated string for text search
char_count: chunkText.length,
keywords: allKeywords.join(' '), // store as space-separated string for text search
char_count: sanitizedText.length,
created_at: timestamp,
source: metadata.source || 'unknown'
source: sanitizedSource
},
}
})
@ -269,12 +314,6 @@ export class RagService {
}
}
/**
* Preprocess an image to enhance text extraction quality.
* Normalizes, grayscales, sharpens, and resizes the image to a manageable size.
* @param filebuffer Buffer of the image file
* @returns - Processed image buffer
*/
private async preprocessImage(filebuffer: Buffer): Promise<Buffer> {
return await sharp(filebuffer)
.grayscale()
@ -284,12 +323,6 @@ export class RagService {
.toBuffer()
}
/**
* If the original PDF has little to no extractable text,
* we can use this method to convert each page to an image for OCR processing.
* @param filebuffer - Buffer of the PDF file
* @returns - Array of image buffers, one per page
*/
private async convertPDFtoImages(filebuffer: Buffer): Promise<Buffer[]> {
const converted = await fromBuffer(filebuffer, {
quality: 50,
@ -301,11 +334,6 @@ export class RagService {
return converted.filter((res) => res.buffer).map((res) => res.buffer!)
}
/**
* Extract text from a PDF file using pdf-parse.
* @param filebuffer - Buffer of the PDF file
* @returns - Extracted text
*/
private async extractPDFText(filebuffer: Buffer): Promise<string> {
const parser = new PDFParse({ data: filebuffer })
const data = await parser.getText()
@ -313,20 +341,10 @@ export class RagService {
return data.text
}
/**
* Extract text from a plain text file.
* @param filebuffer - Buffer of the text file
* @returns - Extracted text
*/
/**
 * Decode a plain-text file buffer as UTF-8.
 * @param filebuffer - Buffer of the text file
 * @returns - Decoded text content
 */
private async extractTXTText(filebuffer: Buffer): Promise<string> {
  const decoded = filebuffer.toString('utf-8')
  return decoded
}
/**
* Extract text from an image file using Tesseract.js OCR.
* @param filebuffer - Buffer of the image file
* @returns - Extracted text
*/
private async extractImageText(filebuffer: Buffer): Promise<string> {
const worker = await createWorker('eng')
const result = await worker.recognize(filebuffer)
@ -334,71 +352,229 @@ export class RagService {
return result.data.text
}
/** OCR pipeline for a single image: preprocess for contrast/size, then recognize text. */
private async processImageFile(fileBuffer: Buffer): Promise<string> {
  const prepared = await this.preprocessImage(fileBuffer)
  const recognized = await this.extractImageText(prepared)
  return recognized
}
/**
* Will process the PDF and attempt to extract text.
* If the extracted text is minimal, it will fallback to OCR on each page.
*/
private async processPDFFile(fileBuffer: Buffer): Promise<string> {
let extractedText = await this.extractPDFText(fileBuffer)
// Check if there was no extracted text or it was very minimal
if (!extractedText || extractedText.trim().length < 100) {
logger.debug('[RAG] PDF text extraction minimal, attempting OCR on pages')
// Convert PDF pages to images for OCR if text extraction was poor
const imageBuffers = await this.convertPDFtoImages(fileBuffer)
extractedText = ''
for (const imgBuffer of imageBuffers) {
const preprocessedImg = await this.preprocessImage(imgBuffer)
const pageText = await this.extractImageText(preprocessedImg)
extractedText += pageText + '\n'
}
}
return extractedText
}
/**
* Process a ZIM file: extract content with metadata and embed each chunk.
* Returns early with complete result since ZIM processing is self-contained.
* Supports batch processing to prevent lock timeouts on large ZIM files.
*/
/**
 * Process a ZIM file: extract content with metadata and embed each chunk.
 * Returns a complete result since ZIM processing is self-contained.
 * Supports batch processing (via batchOffset) so large ZIM files are
 * handled across multiple jobs, each covering up to ZIM_BATCH_SIZE articles.
 *
 * @param filepath - Full path to the ZIM file
 * @param deleteAfterEmbedding - Delete the file once the FINAL batch completes
 * @param batchOffset - Article offset for this batch (undefined = first batch)
 */
private async processZIMFile(
  filepath: string,
  deleteAfterEmbedding: boolean,
  batchOffset?: number
): Promise<{
  success: boolean
  message: string
  chunks?: number
  hasMoreBatches?: boolean
  articlesProcessed?: number
  totalArticles?: number
}> {
  const zimExtractionService = new ZIMExtractionService()
  // Process in batches to avoid lock timeout
  const startOffset = batchOffset ?? 0
  logger.info(
    `[RAG] Extracting ZIM content (batch: offset=${startOffset}, size=${ZIM_BATCH_SIZE})`
  )
  const zimChunks = await zimExtractionService.extractZIMContent(filepath, {
    startOffset,
    batchSize: ZIM_BATCH_SIZE,
  })
  logger.info(
    `[RAG] Extracted ${zimChunks.length} chunks from ZIM file with enhanced metadata`
  )
  // Process each chunk individually with its metadata
  let totalChunks = 0
  for (const zimChunk of zimChunks) {
    const result = await this.embedAndStoreText(zimChunk.text, {
      source: filepath,
      content_type: 'zim_article',
      // Article-level context
      article_title: zimChunk.articleTitle,
      article_path: zimChunk.articlePath,
      // Section-level context
      section_title: zimChunk.sectionTitle,
      full_title: zimChunk.fullTitle,
      hierarchy: zimChunk.hierarchy,
      section_level: zimChunk.sectionLevel,
      // Use the same document ID for all chunks from the same article for grouping in search results
      document_id: zimChunk.documentId,
      // Archive metadata
      archive_title: zimChunk.archiveMetadata.title,
      archive_creator: zimChunk.archiveMetadata.creator,
      archive_publisher: zimChunk.archiveMetadata.publisher,
      archive_date: zimChunk.archiveMetadata.date,
      archive_language: zimChunk.archiveMetadata.language,
      archive_description: zimChunk.archiveMetadata.description,
      // Extraction metadata - not overly relevant for search, but could be useful for debugging and future features...
      extraction_strategy: zimChunk.strategy,
    })
    if (result) {
      totalChunks += result.chunks
    }
  }
  // Count unique articles processed in this batch.
  // NOTE(review): an article whose sections were all empty yields no chunks
  // and is not counted here, so this can slightly undercount the articles the
  // extractor actually consumed — confirm against ZIMExtractionService if
  // exact offsets matter.
  const articlesInBatch = new Set(zimChunks.map((c) => c.documentId)).size
  // FIX: a full batch of ARTICLES signals more may remain. The previous check
  // compared the CHUNK count to the article batch size
  // (zimChunks.length === ZIM_BATCH_SIZE); since one article usually yields
  // several chunks, that equality almost never held, so follow-up batches were
  // skipped and the file could be deleted prematurely.
  const hasMoreBatches = articlesInBatch >= ZIM_BATCH_SIZE
  logger.info(
    `[RAG] Successfully embedded ${totalChunks} total chunks from ${articlesInBatch} articles (hasMore: ${hasMoreBatches})`
  )
  // Only delete the file when:
  // 1. deleteAfterEmbedding is true (caller wants deletion)
  // 2. No more batches remain (this is the final batch)
  // This prevents race conditions where early batches complete after later ones
  const shouldDelete = deleteAfterEmbedding && !hasMoreBatches
  if (shouldDelete) {
    logger.info(`[RAG] Final batch complete, deleting ZIM file: ${filepath}`)
    await deleteFileIfExists(filepath)
  } else if (!hasMoreBatches) {
    logger.info(`[RAG] Final batch complete, but file deletion was not requested`)
  }
  return {
    success: true,
    message: hasMoreBatches
      ? 'ZIM batch processed successfully. More batches remain.'
      : 'ZIM file processed and embedded successfully with enhanced metadata.',
    chunks: totalChunks,
    hasMoreBatches,
    articlesProcessed: articlesInBatch,
  }
}
/** Plain-text pipeline: decode the buffer via the TXT extractor. */
private async processTextFile(fileBuffer: Buffer): Promise<string> {
  const text = await this.extractTXTText(fileBuffer)
  return text
}
/**
 * Embed extracted text into the vector store and, on request, delete the
 * source file afterwards.
 *
 * @param extractedText - Text produced by one of the extractors
 * @param filepath - Path of the originating file (stored as `source` metadata)
 * @param deleteAfterEmbedding - When true, remove the file after a successful embed
 * @returns success flag, human-readable message, and chunk count on success
 */
private async embedTextAndCleanup(
  extractedText: string,
  filepath: string,
  deleteAfterEmbedding: boolean = false
): Promise<{ success: boolean; message: string; chunks?: number }> {
  // Nothing to embed — reported as non-success so callers can surface it
  if (!extractedText || extractedText.trim().length === 0) {
    // FIX: corrected "succesfully" typo in the user-facing message
    return { success: false, message: 'Process completed successfully, but no text was found to embed.' }
  }
  const embedResult = await this.embedAndStoreText(extractedText, {
    source: filepath
  })
  if (!embedResult) {
    return { success: false, message: 'Failed to embed and store the extracted text.' }
  }
  if (deleteAfterEmbedding) {
    logger.info(`[RAG] Embedding complete, deleting uploaded file: ${filepath}`)
    await deleteFileIfExists(filepath)
  }
  return {
    success: true,
    message: 'File processed and embedded successfully.',
    chunks: embedResult.chunks,
  }
}
/**
* Main pipeline to process and embed an uploaded file into the RAG knowledge base.
* This includes text extraction, chunking, embedding, and storing in Qdrant.
*
* Orchestrates file type detection and delegates to specialized processors.
* For ZIM files, supports batch processing via batchOffset parameter.
*/
/**
 * Main pipeline to process and embed a file into the RAG knowledge base:
 * type detection, text extraction, chunking, embedding, and Qdrant storage.
 *
 * ZIM files are delegated to processZIMFile (self-contained workflow,
 * batchable via batchOffset); all other types go through text extraction
 * followed by embedTextAndCleanup.
 *
 * @param filepath - Full path to the uploaded file
 * @param deleteAfterEmbedding - Delete the file after (final) embedding
 * @param batchOffset - Article offset for batched ZIM processing
 */
public async processAndEmbedFile(
  filepath: string,
  deleteAfterEmbedding: boolean = false,
  batchOffset?: number
): Promise<{
  success: boolean
  message: string
  chunks?: number
  hasMoreBatches?: boolean
  articlesProcessed?: number
  totalArticles?: number
}> {
  try {
    const fileType = determineFileType(filepath)
    logger.debug(`[RAG] Processing file: ${filepath} (detected type: ${fileType})`)
    if (fileType === 'unknown') {
      return { success: false, message: 'Unsupported file type.' }
    }
    // ZIM files read from disk directly and have their own embedding workflow.
    // FIX: handling this branch BEFORE the buffer read removes the need for
    // `fileBuffer!` non-null assertions in the remaining branches.
    if (fileType === 'zim') {
      return await this.processZIMFile(filepath, deleteAfterEmbedding, batchOffset)
    }
    const fileBuffer = await getFile(filepath, 'buffer')
    if (!fileBuffer) {
      return { success: false, message: 'Failed to read the uploaded file.' }
    }
    // Extract text based on file type
    let extractedText: string
    switch (fileType) {
      case 'image':
        extractedText = await this.processImageFile(fileBuffer)
        break
      case 'pdf':
        extractedText = await this.processPDFFile(fileBuffer)
        break
      case 'text':
      default:
        extractedText = await this.processTextFile(fileBuffer)
        break
    }
    // Embed extracted text and cleanup
    return await this.embedTextAndCleanup(extractedText, filepath, deleteAfterEmbedding)
  } catch (error) {
    logger.error('[RAG] Error processing and embedding file:', error)
    return { success: false, message: 'Error processing and embedding file.' }
  }
}
@ -497,6 +673,13 @@ export class RagService {
keywords: (result.payload?.keywords as string) || '',
chunk_index: (result.payload?.chunk_index as number) || 0,
created_at: (result.payload?.created_at as number) || 0,
// Enhanced ZIM metadata (likely undefined for non-ZIM content)
article_title: result.payload?.article_title as string | undefined,
section_title: result.payload?.section_title as string | undefined,
full_title: result.payload?.full_title as string | undefined,
hierarchy: result.payload?.hierarchy as string | undefined,
document_id: result.payload?.document_id as string | undefined,
content_type: result.payload?.content_type as string | undefined,
}))
const rerankedResults = this.rerankResults(resultsWithMetadata, keywords, query)
@ -508,7 +691,7 @@ export class RagService {
)
})
// Return top N results
// Return top N results with enhanced metadata
return rerankedResults.slice(0, limit).map((result) => ({
text: result.text,
score: result.finalScore,
@ -516,6 +699,13 @@ export class RagService {
chunk_index: result.chunk_index,
created_at: result.created_at,
semantic_score: result.score,
// Enhanced ZIM metadata (likely undefined for non-ZIM content)
article_title: result.article_title,
section_title: result.section_title,
full_title: result.full_title,
hierarchy: result.hierarchy,
document_id: result.document_id,
content_type: result.content_type,
},
}))
} catch (error) {
@ -544,6 +734,12 @@ export class RagService {
keywords: string
chunk_index: number
created_at: number
article_title?: string
section_title?: string
full_title?: string
hierarchy?: string
document_id?: string
content_type?: string
}>,
queryKeywords: string[],
originalQuery: string
@ -553,6 +749,12 @@ export class RagService {
finalScore: number
chunk_index: number
created_at: number
article_title?: string
section_title?: string
full_title?: string
hierarchy?: string
document_id?: string
content_type?: string
}> {
return results
.map((result) => {
@ -711,11 +913,9 @@ export class RagService {
for (const fileInfo of filesToEmbed) {
try {
logger.info(`[RAG] Dispatching embed job for: ${fileInfo.source}`)
const stats = await getFileStatsIfExists(fileInfo.path)
await EmbedFileJob.dispatch({
filePath: fileInfo.path,
fileName: fileInfo.source,
fileSize: stats?.size,
})
logger.info(`[RAG] Successfully dispatched job for ${fileInfo.source}`)
} catch (fileError) {

View File

@ -0,0 +1,310 @@
import { Archive, Entry } from '@openzim/libzim'
import * as cheerio from 'cheerio'
import { HTML_SELECTORS_TO_REMOVE, NON_CONTENT_HEADING_PATTERNS } from '../../constants/zim_extraction.js'
import logger from '@adonisjs/core/services/logger'
import { ExtractZIMChunkingStrategy, ExtractZIMContentOptions, ZIMContentChunk, ZIMArchiveMetadata } from '../../types/zim.js'
import { randomUUID } from 'node:crypto'
import { access } from 'node:fs/promises'
export class ZIMExtractionService {
/**
 * Reads archive-level metadata (title, creator, publisher, date, language,
 * description) from the ZIM archive. Any read failure falls back to safe
 * defaults so extraction can continue.
 */
private extractArchiveMetadata(archive: Archive): ZIMArchiveMetadata {
  const fallback: ZIMArchiveMetadata = {
    title: 'Unknown',
    creator: 'Unknown',
    publisher: 'Unknown',
    date: 'Unknown',
    language: 'Unknown',
    description: '',
  }
  try {
    return {
      title: archive.getMetadata('Title') || archive.getMetadata('Name') || 'Unknown',
      creator: archive.getMetadata('Creator') || 'Unknown',
      publisher: archive.getMetadata('Publisher') || 'Unknown',
      date: archive.getMetadata('Date') || 'Unknown',
      language: archive.getMetadata('Language') || 'Unknown',
      description: archive.getMetadata('Description') || '',
    }
  } catch (error) {
    logger.warn('[ZIMExtractionService]: Could not extract all metadata, using defaults', error)
    return fallback
  }
}
/**
* Breaks out a ZIM file's entries into their structured content form
* to facilitate better indexing and retrieval.
* Returns enhanced chunks with full article context and metadata.
*
* @param filePath - Path to the ZIM file
* @param opts - Options including maxArticles, strategy, onProgress, startOffset, and batchSize
*/
async extractZIMContent(filePath: string, opts: ExtractZIMContentOptions = {}): Promise<ZIMContentChunk[]> {
try {
logger.info(`[ZIMExtractionService]: Processing ZIM file at path: ${filePath}`)
// defensive - check if file still exists before opening
// could have been deleted by another process or batch
try {
await access(filePath)
} catch (error) {
logger.error(`[ZIMExtractionService]: ZIM file not accessible: ${filePath}`)
throw new Error(`ZIM file not found or not accessible: ${filePath}`)
}
const archive = new Archive(filePath)
// Extract archive-level metadata once
const archiveMetadata = this.extractArchiveMetadata(archive)
logger.info(`[ZIMExtractionService]: Archive metadata - Title: ${archiveMetadata.title}, Language: ${archiveMetadata.language}`)
// articlesProcessed counts articles handled in THIS batch; articlesSkipped
// counts articles passed over to reach the batch's start offset.
let articlesProcessed = 0
let articlesSkipped = 0
// Guards against processing the same entry path twice within this batch
const processedPaths = new Set<string>()
const toReturn: ZIMContentChunk[] = []
// Support batch processing to avoid lock timeouts on large ZIM files
const startOffset = opts.startOffset || 0
// NOTE: '||' treats a batchSize/maxArticles of 0 as "unset" and falls back
// to the next option (ultimately Infinity = process everything).
const batchSize = opts.batchSize || (opts.maxArticles || Infinity)
for (const entry of archive.iterByPath()) {
// Skip articles until we reach the start offset
// NOTE(review): processedPaths is empty during this skip phase, so duplicate
// article paths each advance the offset here, while the processing phase
// below dedupes them — offsets could drift across batches if an archive
// contains duplicate paths; confirm against real archives.
if (articlesSkipped < startOffset) {
if (this.isArticleEntry(entry) && !processedPaths.has(entry.path)) {
articlesSkipped++
}
continue
}
// Batch boundary reached: stop and let the caller queue the next batch
if (articlesProcessed >= batchSize) {
break
}
if (!this.isArticleEntry(entry)) {
logger.debug(`[ZIMExtractionService]: Skipping non-article entry at path: ${entry.path}`)
continue
}
if (processedPaths.has(entry.path)) {
logger.debug(`[ZIMExtractionService]: Skipping duplicate entry at path: ${entry.path}`)
continue
}
processedPaths.add(entry.path)
// Decode the entry's blob and strip boilerplate markup before chunking
const item = entry.item
const blob = item.data
const html = this.getCleanedHTMLString(blob.data)
// Caller may force a strategy; otherwise choose based on heading structure
const strategy = opts.strategy || this.chooseChunkingStrategy(html);
logger.debug(`[ZIMExtractionService]: Chosen chunking strategy for path ${entry.path}: ${strategy}`)
// Generate a unique document ID. All chunks from same article will share it
const documentId = randomUUID()
const articleTitle = entry.title || entry.path
let chunks: ZIMContentChunk[]
if (strategy === 'structured') {
// One chunk per section, each carrying its heading hierarchy
const structured = this.extractStructuredContent(html)
chunks = structured.sections.map(s => ({
text: s.text,
articleTitle,
articlePath: entry.path,
sectionTitle: s.heading,
fullTitle: `${articleTitle} - ${s.heading}`,
hierarchy: `${articleTitle} > ${s.heading}`,
sectionLevel: s.level,
documentId,
archiveMetadata,
strategy,
}))
} else {
// Simple strategy - entire article as one chunk
const text = this.extractTextFromHTML(html) || ''
chunks = [{
text,
articleTitle,
articlePath: entry.path,
sectionTitle: articleTitle, // Same as article for simple strategy
fullTitle: articleTitle,
hierarchy: articleTitle,
documentId,
archiveMetadata,
strategy,
}]
}
logger.debug(`Extracted ${chunks.length} chunks from article at path: ${entry.path} using strategy: ${strategy}`)
// Drop whitespace-only chunks before returning
const nonEmptyChunks = chunks.filter(c => c.text.trim().length > 0)
logger.debug(`After filtering empty chunks, ${nonEmptyChunks.length} chunks remain for article at path: ${entry.path}`)
toReturn.push(...nonEmptyChunks)
articlesProcessed++
// Progress is reported against the archive's TOTAL article count, not
// this batch's size.
if (opts.onProgress) {
opts.onProgress(articlesProcessed, archive.articleCount)
}
}
logger.info(`[ZIMExtractionService]: Completed processing ZIM file. Total articles processed: ${articlesProcessed}`)
logger.debug("Final structured content sample:", toReturn.slice(0, 3).map(c => ({
articleTitle: c.articleTitle,
sectionTitle: c.sectionTitle,
hierarchy: c.hierarchy,
textPreview: c.text.substring(0, 100)
})))
logger.debug("Total structured sections extracted:", toReturn.length)
return toReturn
} catch (error) {
logger.error('Error processing ZIM file:', error)
throw error
}
}
/**
 * Picks a chunking strategy for an article's HTML. A caller-supplied
 * forceStrategy always wins; otherwise articles with meaningful heading
 * structure are chunked per-section ('structured'), everything else is
 * kept as one blob of text ('simple').
 */
private chooseChunkingStrategy(html: string, options = {
  forceStrategy: null as ExtractZIMChunkingStrategy | null,
}): ExtractZIMChunkingStrategy {
  if (options.forceStrategy) {
    return options.forceStrategy;
  }
  // Simple analysis to determine whether the HTML has any meaningful
  // structure we can leverage for better chunking.
  if (this.hasStructuredHeadings(html)) {
    return 'structured';
  }
  return 'simple';
}
/**
 * Decodes a ZIM blob to UTF-8 and strips boilerplate nodes (per
 * HTML_SELECTORS_TO_REMOVE) before text extraction.
 */
private getCleanedHTMLString(buff: Buffer<ArrayBufferLike>): string {
  const $ = cheerio.load(buff.toString('utf-8'));
  for (const selector of HTML_SELECTORS_TO_REMOVE) {
    $(selector).remove();
  }
  return $.html();
}
/**
 * Collapses an HTML document to whitespace-normalized plain text.
 * Prefers <body> content, falling back to the document root when no body
 * element exists. Returns null if parsing fails.
 */
private extractTextFromHTML(html: string): string | null {
  try {
    const $ = cheerio.load(html)
    const scope = $('body').length ? $('body') : $.root()
    const rawText = scope.text()
    return rawText.replace(/\s+/g, ' ').replace(/\n\s*\n/g, '\n').trim()
  } catch (error) {
    logger.error('Error extracting text from HTML:', error)
    return null
  }
}
/**
 * Splits article HTML into titled sections keyed on h2–h4 headings.
 * Content appearing before the first heading lands in an implicit
 * "Introduction" section (level 2). Only p/ul/ol/dl/table text is collected;
 * other elements are ignored.
 */
private extractStructuredContent(html: string) {
  const $ = cheerio.load(html);
  const title = $('h1').first().text().trim() || $('title').text().trim();
  const sections: Array<{ heading: string; text: string; level: number }> = [];
  // In-progress section state
  let heading = 'Introduction';
  let level = 2;
  let pieces: string[] = [];
  // Closes out the current section; empty sections are silently dropped
  const flush = () => {
    if (pieces.length > 0) {
      sections.push({
        heading,
        text: pieces.join(' ').replace(/\s+/g, ' ').trim(),
        level,
      });
    }
  };
  $('body').children().each((_, element) => {
    const tag = element.tagName?.toLowerCase();
    if (['h2', 'h3', 'h4'].includes(tag)) {
      flush();
      // Begin a new section at this heading
      heading = $(element).text().replace(/\[edit\]/gi, '').trim();
      level = parseInt(tag.substring(1)); // h2 -> 2, h3 -> 3, h4 -> 4
      pieces = [];
    } else if (['p', 'ul', 'ol', 'dl', 'table'].includes(tag)) {
      const text = $(element).text().trim();
      if (text.length > 0) {
        pieces.push(text);
      }
    }
  });
  // Don't lose the trailing section
  flush();
  return {
    title,
    sections,
    fullText: sections.map((s) => `${s.heading}\n${s.text}`).join('\n\n'),
  };
}
/**
 * Decides whether an article's HTML has enough heading structure to be
 * worth chunking by section rather than as one flat text blob.
 *
 * An article qualifies when at least 2 of its h2/h3 headings are "real"
 * sections: a heading of >= 3 chars, not matching a known non-content
 * pattern (References, See also, …), followed by >= 100 chars of text
 * before the next h1–h4 sibling.
 *
 * @returns true when section-aware ('structured') chunking should be used.
 */
private hasStructuredHeadings(html: string): boolean {
  const $ = cheerio.load(html);
  const headings = $('h2, h3').toArray();
  // Consider it structured if it has at least 2 headings to break content into meaningful sections
  if (headings.length < 2) return false;
  // Check that headings have substantial content between them
  let sectionsWithContent = 0;
  for (const heading of headings) {
    const $heading = $(heading);
    const headingText = $heading.text().trim();
    // Skip empty or very short headings, likely not meaningful
    if (headingText.length < 3) continue;
    // Skip common non-content headings
    if (NON_CONTENT_HEADING_PATTERNS.some(pattern => pattern.test(headingText))) {
      continue;
    }
    // Content until next heading
    // NOTE: walks following siblings only — content nested in wrapper divs
    // between headings is not counted toward contentLength.
    let contentLength = 0;
    let $next = $heading.next();
    while ($next.length && !$next.is('h1, h2, h3, h4')) {
      contentLength += $next.text().trim().length;
      $next = $next.next();
    }
    // Consider it a real section if it has at least 100 chars of content
    if (contentLength >= 100) {
      sectionsWithContent++;
    }
  }
  // Require at least 2 sections with substantial content
  return sectionsWithContent >= 2;
}
/**
 * Returns true when a ZIM entry is a real HTML article worth embedding.
 * Redirects are excluded (they carry no content of their own), and any
 * error while inspecting the entry is treated as "not an article".
 */
private isArticleEntry(entry: Entry): boolean {
  try {
    if (entry.isRedirect) return false;
    return ['text/html', 'application/xhtml+xml'].includes(entry.item.mimetype);
  } catch {
    return false;
  }
}
}

View File

@ -43,7 +43,7 @@ interface IZimService {
@inject()
export class ZimService implements IZimService {
constructor(private dockerService: DockerService) {}
constructor(private dockerService: DockerService) { }
async list() {
const dirPath = join(process.cwd(), ZIM_STORAGE_PATH)
@ -275,7 +275,7 @@ export class ZimService implements IZimService {
}
if (restart) {
// Check if there are any remaining ZIM download jobs before restarting
// Check if there are any remaining ZIM download jobs before restarting
const { QueueService } = await import('./queue_service.js')
const queueService = new QueueService()
const queue = queueService.getQueue('downloads')
@ -489,11 +489,11 @@ export class ZimService implements IZimService {
options,
currentSelection: selection
? {
optionId: selection.option_id,
status: selection.status,
filename: selection.filename,
url: selection.url,
}
optionId: selection.option_id,
status: selection.status,
filename: selection.filename,
url: selection.url,
}
: null,
}
}

View File

@ -152,7 +152,7 @@ export function matchesDevice(fsPath: string, deviceName: string): boolean {
return false
}
export function determineFileType(filename: string): 'image' | 'pdf' | 'text' | 'unknown' {
export function determineFileType(filename: string): 'image' | 'pdf' | 'text' | 'zim' | 'unknown' {
const ext = path.extname(filename).toLowerCase()
if (['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'].includes(ext)) {
return 'image'
@ -160,6 +160,8 @@ export function determineFileType(filename: string): 'image' | 'pdf' | 'text' |
return 'pdf'
} else if (['.txt', '.md', '.docx', '.rtf'].includes(ext)) {
return 'text'
} else if (ext === '.zim') {
return 'zim'
} else {
return 'unknown'
}

View File

@ -0,0 +1,48 @@
// CSS selectors stripped from article HTML before text extraction.
// Covers generic page chrome (scripts, styles, navigation, embedded media)
// plus MediaWiki-style boilerplate (infoboxes, navboxes, reference lists,
// edit links, hatnotes) commonly found in Wikipedia-derived ZIM archives.
export const HTML_SELECTORS_TO_REMOVE = [
  // Generic non-content elements
  'script',
  'style',
  'nav',
  'header',
  'footer',
  'noscript',
  'iframe',
  'svg',
  // MediaWiki / Wikipedia boilerplate
  '.navbox',
  '.sidebar',
  '.infobox',
  '.mw-editsection',
  '.reference',
  '.reflist',
  '.toc',
  '.noprint',
  '.mw-jump-link',
  '.mw-headline-anchor',
  '[role="navigation"]',
  '.navbar',
  '.hatnote',
  '.ambox',
  '.sistersitebox',
  '.portal',
  '#coordinates',
  '.geo-nondefault',
  '.authority-control',
]
// Common heading names that usually don't have meaningful content under them
// Used when deciding whether an article is "structured": sections under
// these headings are ignored so that boilerplate (references, link lists)
// does not count as real content. Matching is whole-string, case-insensitive.
export const NON_CONTENT_HEADING_PATTERNS = [
  /^see also$/i,
  /^references$/i,
  /^external links$/i,
  /^further reading$/i,
  /^notes$/i,
  /^bibliography$/i,
  /^navigation$/i,
]
/**
 * Batch size for processing ZIM articles to prevent lock timeout errors.
 * Processing 50 articles at a time balances throughput with job duration.
 * Typical processing time: 2-5 minutes per batch depending on article complexity.
 */
export const ZIM_BATCH_SIZE = 50

1501
admin/package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -76,6 +76,7 @@
"@headlessui/react": "^2.2.4",
"@inertiajs/react": "^2.0.13",
"@markdoc/markdoc": "^0.5.2",
"@openzim/libzim": "^4.0.0",
"@protomaps/basemaps": "^5.7.0",
"@qdrant/js-client-rest": "^1.16.2",
"@tabler/icons-react": "^3.34.0",
@ -92,6 +93,7 @@
"axios": "^1.13.1",
"better-sqlite3": "^12.1.1",
"bullmq": "^5.65.1",
"cheerio": "^1.2.0",
"dockerode": "^4.0.7",
"edge.js": "^6.2.1",
"fast-xml-parser": "^5.2.5",

View File

@ -64,3 +64,47 @@ export type RemoteZimFileEntry = {
author: string
file_name: string
}
/**
 * Caller-tunable options for extracting chunkable content from a ZIM archive.
 */
export type ExtractZIMContentOptions = {
  // Force a chunking strategy; presumably auto-detected per article when
  // omitted — confirm against the extraction service.
  strategy?: ExtractZIMChunkingStrategy
  // Upper bound on how many articles to process in total.
  maxArticles?: number
  // Invoked periodically with (processedArticles, totalArticles).
  onProgress?: (processedArticles: number, totalArticles: number) => void
  // Batch processing options to avoid lock timeouts
  startOffset?: number // Article index to start from for resuming
  batchSize?: number // Max articles to process in this batch
}

// How an article's HTML is split: 'structured' = one chunk per heading
// section, 'simple' = one flat text blob per article.
export type ExtractZIMChunkingStrategy = 'structured' | 'simple'
/**
 * Descriptive metadata for a ZIM archive (title, creator, publisher, …),
 * attached to each extracted chunk so embeddings can be traced back to
 * their source archive. Presumably populated from the archive's metadata
 * entries — TODO confirm in the extraction service.
 */
export type ZIMArchiveMetadata = {
  title: string
  creator: string
  publisher: string
  date: string
  language: string
  description: string
}
export type ZIMContentChunk = {
// Content
text: string
// Article-level context
articleTitle: string
articlePath: string
// Section-level context for structured chunks
sectionTitle: string
fullTitle: string // Combined "Article Title - Section Title"
hierarchy: string // Breadcrumb trail
sectionLevel?: number // Heading level (2=h2, 3=h3, etc.)
// Document grouping
documentId: string // Same for all chunks from one article
// Archive metadata
archiveMetadata: ZIMArchiveMetadata
// Extraction metadata
strategy: ExtractZIMChunkingStrategy
}