mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-03-28 03:29:25 +01:00
feat: zim content embedding
This commit is contained in:
parent
c2b6e079af
commit
8726700a0a
|
|
@ -5,7 +5,6 @@ import type { HttpContext } from '@adonisjs/core/http'
|
|||
import app from '@adonisjs/core/services/app'
|
||||
import { randomBytes } from 'node:crypto'
|
||||
import { sanitizeFilename } from '../utils/fs.js'
|
||||
import { stat } from 'node:fs/promises'
|
||||
import { getJobStatusSchema } from '#validators/rag'
|
||||
|
||||
@inject()
|
||||
|
|
@ -28,20 +27,10 @@ export default class RagController {
|
|||
name: fileName,
|
||||
})
|
||||
|
||||
// Get file size for tracking
|
||||
let fileSize: number | undefined = undefined
|
||||
try {
|
||||
const stats = await stat(fullPath)
|
||||
fileSize = stats.size
|
||||
} catch (error) {
|
||||
// Not critical if we can't get file size, just swallow the error
|
||||
}
|
||||
|
||||
// Dispatch background job for embedding
|
||||
const result = await EmbedFileJob.dispatch({
|
||||
filePath: fullPath,
|
||||
fileName,
|
||||
fileSize,
|
||||
})
|
||||
|
||||
return response.status(202).json({
|
||||
|
|
|
|||
|
|
@ -10,6 +10,10 @@ export interface EmbedFileJobParams {
|
|||
filePath: string
|
||||
fileName: string
|
||||
fileSize?: number
|
||||
// Batch processing for large ZIM files
|
||||
batchOffset?: number // Current batch offset (for ZIM files)
|
||||
totalArticles?: number // Total articles in ZIM (for progress tracking)
|
||||
isFinalBatch?: boolean // Whether this is the last batch (prevents premature deletion)
|
||||
}
|
||||
|
||||
export class EmbedFileJob {
|
||||
|
|
@ -26,9 +30,11 @@ export class EmbedFileJob {
|
|||
}
|
||||
|
||||
async handle(job: Job) {
|
||||
const { filePath, fileName } = job.data as EmbedFileJobParams
|
||||
const { filePath, fileName, batchOffset, totalArticles } = job.data as EmbedFileJobParams
|
||||
|
||||
logger.info(`[EmbedFileJob] Starting embedding process for: ${fileName}`)
|
||||
const isZimBatch = batchOffset !== undefined
|
||||
const batchInfo = isZimBatch ? ` (batch offset: ${batchOffset})` : ''
|
||||
logger.info(`[EmbedFileJob] Starting embedding process for: ${fileName}${batchInfo}`)
|
||||
|
||||
const dockerService = new DockerService()
|
||||
const ollamaService = new OllamaService()
|
||||
|
|
@ -55,30 +61,78 @@ export class EmbedFileJob {
|
|||
await job.updateData({
|
||||
...job.data,
|
||||
status: 'processing',
|
||||
startedAt: Date.now(),
|
||||
startedAt: job.data.startedAt || Date.now(),
|
||||
})
|
||||
|
||||
logger.info(`[EmbedFileJob] Processing file: ${filePath}`)
|
||||
|
||||
// Process and embed the file
|
||||
const result = await ragService.processAndEmbedFile(filePath)
|
||||
// Only allow deletion if explicitly marked as final batch
|
||||
const allowDeletion = job.data.isFinalBatch === true
|
||||
const result = await ragService.processAndEmbedFile(
|
||||
filePath,
|
||||
allowDeletion,
|
||||
batchOffset
|
||||
)
|
||||
|
||||
if (!result.success) {
|
||||
logger.error(`[EmbedFileJob] Failed to process file ${fileName}: ${result.message}`)
|
||||
throw new Error(result.message)
|
||||
}
|
||||
|
||||
// Update progress complete
|
||||
// For ZIM files with batching, check if more batches are needed
|
||||
if (result.hasMoreBatches) {
|
||||
const nextOffset = (batchOffset || 0) + (result.articlesProcessed || 0)
|
||||
logger.info(
|
||||
`[EmbedFileJob] Batch complete. Dispatching next batch at offset ${nextOffset}`
|
||||
)
|
||||
|
||||
// Dispatch next batch (not final yet)
|
||||
await EmbedFileJob.dispatch({
|
||||
filePath,
|
||||
fileName,
|
||||
batchOffset: nextOffset,
|
||||
totalArticles: totalArticles || result.totalArticles,
|
||||
isFinalBatch: false, // Explicitly not final
|
||||
})
|
||||
|
||||
// Calculate progress based on articles processed
|
||||
const progress = totalArticles
|
||||
? Math.round((nextOffset / totalArticles) * 100)
|
||||
: 50
|
||||
|
||||
await job.updateProgress(progress)
|
||||
await job.updateData({
|
||||
...job.data,
|
||||
status: 'batch_completed',
|
||||
lastBatchAt: Date.now(),
|
||||
chunks: (job.data.chunks || 0) + (result.chunks || 0),
|
||||
})
|
||||
|
||||
return {
|
||||
success: true,
|
||||
fileName,
|
||||
filePath,
|
||||
chunks: result.chunks,
|
||||
hasMoreBatches: true,
|
||||
nextOffset,
|
||||
message: `Batch embedded ${result.chunks} chunks, next batch queued`,
|
||||
}
|
||||
}
|
||||
|
||||
// Final batch or non-batched file - mark as complete
|
||||
const totalChunks = (job.data.chunks || 0) + (result.chunks || 0)
|
||||
await job.updateProgress(100)
|
||||
await job.updateData({
|
||||
...job.data,
|
||||
status: 'completed',
|
||||
completedAt: Date.now(),
|
||||
chunks: result.chunks,
|
||||
chunks: totalChunks,
|
||||
})
|
||||
|
||||
const batchMsg = isZimBatch ? ` (final batch, total chunks: ${totalChunks})` : ''
|
||||
logger.info(
|
||||
`[EmbedFileJob] Successfully embedded ${result.chunks} chunks from file: ${fileName}`
|
||||
`[EmbedFileJob] Successfully embedded ${result.chunks} chunks from file: ${fileName}${batchMsg}`
|
||||
)
|
||||
|
||||
return {
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import { createHash } from 'crypto'
|
|||
import { DockerService } from '#services/docker_service'
|
||||
import { ZimService } from '#services/zim_service'
|
||||
import { MapService } from '#services/map_service'
|
||||
import { EmbedFileJob } from './embed_file_job.js'
|
||||
|
||||
export class RunDownloadJob {
|
||||
static get queue() {
|
||||
|
|
@ -24,17 +25,6 @@ export class RunDownloadJob {
|
|||
const { url, filepath, timeout, allowedMimeTypes, forceNew, filetype } =
|
||||
job.data as RunDownloadJobParams
|
||||
|
||||
// console.log("Simulating delay for job for URL:", url)
|
||||
// await new Promise((resolve) => setTimeout(resolve, 30000)) // Simulate initial delay
|
||||
// console.log("Starting download for URL:", url)
|
||||
|
||||
// // simulate progress updates for demonstration
|
||||
// for (let progress = 0; progress <= 100; progress += 10) {
|
||||
// await new Promise((resolve) => setTimeout(resolve, 20000)) // Simulate time taken for each progress step
|
||||
// job.updateProgress(progress)
|
||||
// console.log(`Job progress for URL ${url}: ${progress}%`)
|
||||
// }
|
||||
|
||||
await doResumableDownload({
|
||||
url,
|
||||
filepath,
|
||||
|
|
@ -51,6 +41,16 @@ export class RunDownloadJob {
|
|||
const dockerService = new DockerService()
|
||||
const zimService = new ZimService(dockerService)
|
||||
await zimService.downloadRemoteSuccessCallback([url], true)
|
||||
|
||||
// Dispatch an embedding job for the downloaded ZIM file
|
||||
try {
|
||||
await EmbedFileJob.dispatch({
|
||||
fileName: url.split('/').pop() || '',
|
||||
filePath: filepath,
|
||||
})
|
||||
} catch (error) {
|
||||
console.error(`[RunDownloadJob] Error dispatching EmbedFileJob for URL ${url}:`, error)
|
||||
}
|
||||
} else if (filetype === 'map') {
|
||||
const mapsService = new MapService()
|
||||
await mapsService.downloadRemoteSuccessCallback([url], false)
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ import { streamToString } from '../../util/docs.js'
|
|||
import { getFile, getFileStatsIfExists, listDirectoryContentsRecursive } from '../utils/fs.js'
|
||||
import path from 'path'
|
||||
import InternalServerErrorException from '#exceptions/internal_server_error_exception'
|
||||
import logger from '@adonisjs/core/services/logger'
|
||||
|
||||
export class DocsService {
|
||||
private docsPath = path.join(process.cwd(), 'docs')
|
||||
|
|
@ -46,13 +47,13 @@ export class DocsService {
|
|||
// Filter out attribute-undefined errors which may be caused by emojis and special characters
|
||||
const criticalErrors = errors.filter((e) => e.error.id !== 'attribute-undefined')
|
||||
if (criticalErrors.length > 0) {
|
||||
console.error('Markdoc validation errors:', errors.map((e) => JSON.stringify(e.error)).join(', '))
|
||||
logger.error('Markdoc validation errors:', errors.map((e) => JSON.stringify(e.error)).join(', '))
|
||||
throw new Error('Markdoc validation failed')
|
||||
}
|
||||
|
||||
return Markdoc.transform(ast, config)
|
||||
} catch (error) {
|
||||
console.log('Error parsing Markdoc content:', error)
|
||||
logger.error('Error parsing Markdoc content:', error)
|
||||
throw new InternalServerErrorException(`Error parsing content: ${(error as Error).message}`)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,6 +15,8 @@ import { randomUUID } from 'node:crypto'
|
|||
import { join } from 'node:path'
|
||||
import KVStore from '#models/kv_store'
|
||||
import { parseBoolean } from '../utils/misc.js'
|
||||
import { ZIMExtractionService } from './zim_extraction_service.js'
|
||||
import { ZIM_BATCH_SIZE } from '../../constants/zim_extraction.js'
|
||||
|
||||
@inject()
|
||||
export class RagService {
|
||||
|
|
@ -38,6 +40,67 @@ export class RagService {
|
|||
private ollamaService: OllamaService
|
||||
) { }
|
||||
|
||||
private async _initializeQdrantClient() {
|
||||
if (!this.qdrantInitPromise) {
|
||||
this.qdrantInitPromise = (async () => {
|
||||
const qdrantUrl = await this.dockerService.getServiceURL(SERVICE_NAMES.QDRANT)
|
||||
if (!qdrantUrl) {
|
||||
throw new Error('Qdrant service is not installed or running.')
|
||||
}
|
||||
this.qdrant = new QdrantClient({ url: qdrantUrl })
|
||||
})()
|
||||
}
|
||||
return this.qdrantInitPromise
|
||||
}
|
||||
|
||||
private async _ensureDependencies() {
|
||||
if (!this.qdrant) {
|
||||
await this._initializeQdrantClient()
|
||||
}
|
||||
}
|
||||
|
||||
private async _ensureCollection(
|
||||
collectionName: string,
|
||||
dimensions: number = RagService.EMBEDDING_DIMENSION
|
||||
) {
|
||||
try {
|
||||
await this._ensureDependencies()
|
||||
const collections = await this.qdrant!.getCollections()
|
||||
const collectionExists = collections.collections.some((col) => col.name === collectionName)
|
||||
|
||||
if (!collectionExists) {
|
||||
await this.qdrant!.createCollection(collectionName, {
|
||||
vectors: {
|
||||
size: dimensions,
|
||||
distance: 'Cosine',
|
||||
},
|
||||
})
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error('Error ensuring Qdrant collection:', error)
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sanitizes text to ensure it's safe for JSON encoding and Qdrant storage.
|
||||
* Removes problematic characters that can cause "unexpected end of hex escape" errors:
|
||||
* - Null bytes (\x00)
|
||||
* - Invalid Unicode sequences
|
||||
* - Control characters (except newlines, tabs, and carriage returns)
|
||||
*/
|
||||
private sanitizeText(text: string): string {
|
||||
return text
|
||||
// Null bytes
|
||||
.replace(/\x00/g, '')
|
||||
// Problematic control characters (keep \n, \r, \t)
|
||||
.replace(/[\x01-\x08\x0B-\x0C\x0E-\x1F\x7F]/g, '')
|
||||
// Invalid Unicode surrogates
|
||||
.replace(/[\uD800-\uDFFF]/g, '')
|
||||
// Trim extra whitespace
|
||||
.trim()
|
||||
}
|
||||
|
||||
/**
|
||||
* Estimates token count for text. This is a conservative approximation:
|
||||
* - English text: ~1 token per 3 characters
|
||||
|
|
@ -114,48 +177,6 @@ export class RagService {
|
|||
return [...new Set(keywords)]
|
||||
}
|
||||
|
||||
private async _initializeQdrantClient() {
|
||||
if (!this.qdrantInitPromise) {
|
||||
this.qdrantInitPromise = (async () => {
|
||||
const qdrantUrl = await this.dockerService.getServiceURL(SERVICE_NAMES.QDRANT)
|
||||
if (!qdrantUrl) {
|
||||
throw new Error('Qdrant service is not installed or running.')
|
||||
}
|
||||
this.qdrant = new QdrantClient({ url: qdrantUrl })
|
||||
})()
|
||||
}
|
||||
return this.qdrantInitPromise
|
||||
}
|
||||
|
||||
private async _ensureDependencies() {
|
||||
if (!this.qdrant) {
|
||||
await this._initializeQdrantClient()
|
||||
}
|
||||
}
|
||||
|
||||
private async _ensureCollection(
|
||||
collectionName: string,
|
||||
dimensions: number = RagService.EMBEDDING_DIMENSION
|
||||
) {
|
||||
try {
|
||||
await this._ensureDependencies()
|
||||
const collections = await this.qdrant!.getCollections()
|
||||
const collectionExists = collections.collections.some((col) => col.name === collectionName)
|
||||
|
||||
if (!collectionExists) {
|
||||
await this.qdrant!.createCollection(collectionName, {
|
||||
vectors: {
|
||||
size: dimensions,
|
||||
distance: 'Cosine',
|
||||
},
|
||||
})
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error('Error ensuring Qdrant collection:', error)
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
public async embedAndStoreText(
|
||||
text: string,
|
||||
metadata: Record<string, any> = {}
|
||||
|
|
@ -237,21 +258,45 @@ export class RagService {
|
|||
|
||||
const timestamp = Date.now()
|
||||
const points = chunks.map((chunkText, index) => {
|
||||
// Extract keywords for hybrid search
|
||||
const keywords = this.extractKeywords(chunkText)
|
||||
logger.debug(`[RAG] Extracted keywords for chunk ${index}: [${keywords.join(', ')}]`)
|
||||
// Sanitize text to prevent JSON encoding errors
|
||||
const sanitizedText = this.sanitizeText(chunkText)
|
||||
|
||||
// Extract keywords from content
|
||||
const contentKeywords = this.extractKeywords(sanitizedText)
|
||||
|
||||
// For ZIM content, also extract keywords from structural metadata
|
||||
let structuralKeywords: string[] = []
|
||||
if (metadata.full_title) {
|
||||
structuralKeywords = this.extractKeywords(metadata.full_title as string)
|
||||
} else if (metadata.article_title) {
|
||||
structuralKeywords = this.extractKeywords(metadata.article_title as string)
|
||||
}
|
||||
|
||||
// Combine and dedup keywords
|
||||
const allKeywords = [...new Set([...structuralKeywords, ...contentKeywords])]
|
||||
|
||||
logger.debug(`[RAG] Extracted keywords for chunk ${index}: [${allKeywords.join(', ')}]`)
|
||||
if (structuralKeywords.length > 0) {
|
||||
logger.debug(`[RAG] - Structural: [${structuralKeywords.join(', ')}], Content: [${contentKeywords.join(', ')}]`)
|
||||
}
|
||||
|
||||
// Sanitize source metadata as well
|
||||
const sanitizedSource = typeof metadata.source === 'string'
|
||||
? this.sanitizeText(metadata.source)
|
||||
: 'unknown'
|
||||
|
||||
return {
|
||||
id: randomUUID(), // qdrant requires either uuid or unsigned int
|
||||
vector: embeddings[index],
|
||||
payload: {
|
||||
...metadata,
|
||||
text: chunkText,
|
||||
text: sanitizedText,
|
||||
chunk_index: index,
|
||||
total_chunks: chunks.length,
|
||||
keywords: keywords.join(' '), // Store as space-separated string for text search
|
||||
char_count: chunkText.length,
|
||||
keywords: allKeywords.join(' '), // store as space-separated string for text search
|
||||
char_count: sanitizedText.length,
|
||||
created_at: timestamp,
|
||||
source: metadata.source || 'unknown'
|
||||
source: sanitizedSource
|
||||
},
|
||||
}
|
||||
})
|
||||
|
|
@ -269,12 +314,6 @@ export class RagService {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Preprocess an image to enhance text extraction quality.
|
||||
* Normalizes, grayscales, sharpens, and resizes the image to a manageable size.
|
||||
* @param filebuffer Buffer of the image file
|
||||
* @returns - Processed image buffer
|
||||
*/
|
||||
private async preprocessImage(filebuffer: Buffer): Promise<Buffer> {
|
||||
return await sharp(filebuffer)
|
||||
.grayscale()
|
||||
|
|
@ -284,12 +323,6 @@ export class RagService {
|
|||
.toBuffer()
|
||||
}
|
||||
|
||||
/**
|
||||
* If the original PDF has little to no extractable text,
|
||||
* we can use this method to convert each page to an image for OCR processing.
|
||||
* @param filebuffer - Buffer of the PDF file
|
||||
* @returns - Array of image buffers, one per page
|
||||
*/
|
||||
private async convertPDFtoImages(filebuffer: Buffer): Promise<Buffer[]> {
|
||||
const converted = await fromBuffer(filebuffer, {
|
||||
quality: 50,
|
||||
|
|
@ -301,11 +334,6 @@ export class RagService {
|
|||
return converted.filter((res) => res.buffer).map((res) => res.buffer!)
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract text from a PDF file using pdf-parse.
|
||||
* @param filebuffer - Buffer of the PDF file
|
||||
* @returns - Extracted text
|
||||
*/
|
||||
private async extractPDFText(filebuffer: Buffer): Promise<string> {
|
||||
const parser = new PDFParse({ data: filebuffer })
|
||||
const data = await parser.getText()
|
||||
|
|
@ -313,20 +341,10 @@ export class RagService {
|
|||
return data.text
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract text from a plain text file.
|
||||
* @param filebuffer - Buffer of the text file
|
||||
* @returns - Extracted text
|
||||
*/
|
||||
private async extractTXTText(filebuffer: Buffer): Promise<string> {
|
||||
return filebuffer.toString('utf-8')
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract text from an image file using Tesseract.js OCR.
|
||||
* @param filebuffer - Buffer of the image file
|
||||
* @returns - Extracted text
|
||||
*/
|
||||
private async extractImageText(filebuffer: Buffer): Promise<string> {
|
||||
const worker = await createWorker('eng')
|
||||
const result = await worker.recognize(filebuffer)
|
||||
|
|
@ -334,71 +352,229 @@ export class RagService {
|
|||
return result.data.text
|
||||
}
|
||||
|
||||
private async processImageFile(fileBuffer: Buffer): Promise<string> {
|
||||
const preprocessedBuffer = await this.preprocessImage(fileBuffer)
|
||||
return await this.extractImageText(preprocessedBuffer)
|
||||
}
|
||||
|
||||
/**
|
||||
* Will process the PDF and attempt to extract text.
|
||||
* If the extracted text is minimal, it will fallback to OCR on each page.
|
||||
*/
|
||||
private async processPDFFile(fileBuffer: Buffer): Promise<string> {
|
||||
let extractedText = await this.extractPDFText(fileBuffer)
|
||||
|
||||
// Check if there was no extracted text or it was very minimal
|
||||
if (!extractedText || extractedText.trim().length < 100) {
|
||||
logger.debug('[RAG] PDF text extraction minimal, attempting OCR on pages')
|
||||
// Convert PDF pages to images for OCR if text extraction was poor
|
||||
const imageBuffers = await this.convertPDFtoImages(fileBuffer)
|
||||
extractedText = ''
|
||||
|
||||
for (const imgBuffer of imageBuffers) {
|
||||
const preprocessedImg = await this.preprocessImage(imgBuffer)
|
||||
const pageText = await this.extractImageText(preprocessedImg)
|
||||
extractedText += pageText + '\n'
|
||||
}
|
||||
}
|
||||
|
||||
return extractedText
|
||||
}
|
||||
|
||||
/**
|
||||
* Process a ZIM file: extract content with metadata and embed each chunk.
|
||||
* Returns early with complete result since ZIM processing is self-contained.
|
||||
* Supports batch processing to prevent lock timeouts on large ZIM files.
|
||||
*/
|
||||
private async processZIMFile(
|
||||
filepath: string,
|
||||
deleteAfterEmbedding: boolean,
|
||||
batchOffset?: number
|
||||
): Promise<{
|
||||
success: boolean
|
||||
message: string
|
||||
chunks?: number
|
||||
hasMoreBatches?: boolean
|
||||
articlesProcessed?: number
|
||||
totalArticles?: number
|
||||
}> {
|
||||
const zimExtractionService = new ZIMExtractionService()
|
||||
|
||||
// Process in batches to avoid lock timeout
|
||||
const startOffset = batchOffset || 0
|
||||
|
||||
logger.info(
|
||||
`[RAG] Extracting ZIM content (batch: offset=${startOffset}, size=${ZIM_BATCH_SIZE})`
|
||||
)
|
||||
|
||||
const zimChunks = await zimExtractionService.extractZIMContent(filepath, {
|
||||
startOffset,
|
||||
batchSize: ZIM_BATCH_SIZE,
|
||||
})
|
||||
|
||||
logger.info(
|
||||
`[RAG] Extracted ${zimChunks.length} chunks from ZIM file with enhanced metadata`
|
||||
)
|
||||
|
||||
// Process each chunk individually with its metadata
|
||||
let totalChunks = 0
|
||||
for (const zimChunk of zimChunks) {
|
||||
const result = await this.embedAndStoreText(zimChunk.text, {
|
||||
source: filepath,
|
||||
content_type: 'zim_article',
|
||||
|
||||
// Article-level context
|
||||
article_title: zimChunk.articleTitle,
|
||||
article_path: zimChunk.articlePath,
|
||||
|
||||
// Section-level context
|
||||
section_title: zimChunk.sectionTitle,
|
||||
full_title: zimChunk.fullTitle,
|
||||
hierarchy: zimChunk.hierarchy,
|
||||
section_level: zimChunk.sectionLevel,
|
||||
|
||||
// Use the same document ID for all chunks from the same article for grouping in search results
|
||||
document_id: zimChunk.documentId,
|
||||
|
||||
// Archive metadata
|
||||
archive_title: zimChunk.archiveMetadata.title,
|
||||
archive_creator: zimChunk.archiveMetadata.creator,
|
||||
archive_publisher: zimChunk.archiveMetadata.publisher,
|
||||
archive_date: zimChunk.archiveMetadata.date,
|
||||
archive_language: zimChunk.archiveMetadata.language,
|
||||
archive_description: zimChunk.archiveMetadata.description,
|
||||
|
||||
// Extraction metadata - not overly relevant for search, but could be useful for debugging and future features...
|
||||
extraction_strategy: zimChunk.strategy,
|
||||
})
|
||||
|
||||
if (result) {
|
||||
totalChunks += result.chunks
|
||||
}
|
||||
}
|
||||
|
||||
// Count unique articles processed in this batch
|
||||
const articlesInBatch = new Set(zimChunks.map((c) => c.documentId)).size
|
||||
const hasMoreBatches = zimChunks.length === ZIM_BATCH_SIZE
|
||||
|
||||
logger.info(
|
||||
`[RAG] Successfully embedded ${totalChunks} total chunks from ${articlesInBatch} articles (hasMore: ${hasMoreBatches})`
|
||||
)
|
||||
|
||||
// Only delete the file when:
|
||||
// 1. deleteAfterEmbedding is true (caller wants deletion)
|
||||
// 2. No more batches remain (this is the final batch)
|
||||
// This prevents race conditions where early batches complete after later ones
|
||||
const shouldDelete = deleteAfterEmbedding && !hasMoreBatches
|
||||
if (shouldDelete) {
|
||||
logger.info(`[RAG] Final batch complete, deleting ZIM file: ${filepath}`)
|
||||
await deleteFileIfExists(filepath)
|
||||
} else if (!hasMoreBatches) {
|
||||
logger.info(`[RAG] Final batch complete, but file deletion was not requested`)
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
message: hasMoreBatches
|
||||
? 'ZIM batch processed successfully. More batches remain.'
|
||||
: 'ZIM file processed and embedded successfully with enhanced metadata.',
|
||||
chunks: totalChunks,
|
||||
hasMoreBatches,
|
||||
articlesProcessed: articlesInBatch,
|
||||
}
|
||||
}
|
||||
|
||||
private async processTextFile(fileBuffer: Buffer): Promise<string> {
|
||||
return await this.extractTXTText(fileBuffer)
|
||||
}
|
||||
|
||||
private async embedTextAndCleanup(
|
||||
extractedText: string,
|
||||
filepath: string,
|
||||
deleteAfterEmbedding: boolean = false
|
||||
): Promise<{ success: boolean; message: string; chunks?: number }> {
|
||||
if (!extractedText || extractedText.trim().length === 0) {
|
||||
return { success: false, message: 'Process completed succesfully, but no text was found to embed.' }
|
||||
}
|
||||
|
||||
const embedResult = await this.embedAndStoreText(extractedText, {
|
||||
source: filepath
|
||||
})
|
||||
|
||||
if (!embedResult) {
|
||||
return { success: false, message: 'Failed to embed and store the extracted text.' }
|
||||
}
|
||||
|
||||
if (deleteAfterEmbedding) {
|
||||
logger.info(`[RAG] Embedding complete, deleting uploaded file: ${filepath}`)
|
||||
await deleteFileIfExists(filepath)
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
message: 'File processed and embedded successfully.',
|
||||
chunks: embedResult.chunks,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Main pipeline to process and embed an uploaded file into the RAG knowledge base.
|
||||
* This includes text extraction, chunking, embedding, and storing in Qdrant.
|
||||
*
|
||||
* Orchestrates file type detection and delegates to specialized processors.
|
||||
* For ZIM files, supports batch processing via batchOffset parameter.
|
||||
*/
|
||||
public async processAndEmbedFile(
|
||||
filepath: string, // Should already be the full path to the uploaded file
|
||||
deleteAfterEmbedding: boolean = false
|
||||
): Promise<{ success: boolean; message: string; chunks?: number }> {
|
||||
filepath: string,
|
||||
deleteAfterEmbedding: boolean = false,
|
||||
batchOffset?: number
|
||||
): Promise<{
|
||||
success: boolean
|
||||
message: string
|
||||
chunks?: number
|
||||
hasMoreBatches?: boolean
|
||||
articlesProcessed?: number
|
||||
totalArticles?: number
|
||||
}> {
|
||||
try {
|
||||
const fileType = determineFileType(filepath)
|
||||
logger.debug(`[RAG] Processing file: ${filepath} (detected type: ${fileType})`)
|
||||
|
||||
if (fileType === 'unknown') {
|
||||
return { success: false, message: 'Unsupported file type.' }
|
||||
}
|
||||
|
||||
const origFileBuffer = await getFile(filepath, 'buffer')
|
||||
if (!origFileBuffer) {
|
||||
// Read file buffer (not needed for ZIM as it reads directly)
|
||||
const fileBuffer = fileType !== 'zim' ? await getFile(filepath, 'buffer') : null
|
||||
if (fileType !== 'zim' && !fileBuffer) {
|
||||
return { success: false, message: 'Failed to read the uploaded file.' }
|
||||
}
|
||||
|
||||
let extractedText = ''
|
||||
|
||||
if (fileType === 'image') {
|
||||
const preprocessedBuffer = await this.preprocessImage(origFileBuffer)
|
||||
extractedText = await this.extractImageText(preprocessedBuffer)
|
||||
} else if (fileType === 'pdf') {
|
||||
extractedText = await this.extractPDFText(origFileBuffer)
|
||||
// Check if there was no extracted text or it was very minimal
|
||||
if (!extractedText || extractedText.trim().length < 100) {
|
||||
// Convert PDF pages to images for OCR
|
||||
const imageBuffers = await this.convertPDFtoImages(origFileBuffer)
|
||||
for (const imgBuffer of imageBuffers) {
|
||||
const preprocessedImg = await this.preprocessImage(imgBuffer)
|
||||
const pageText = await this.extractImageText(preprocessedImg)
|
||||
extractedText += pageText + '\n'
|
||||
}
|
||||
}
|
||||
} else {
|
||||
extractedText = await this.extractTXTText(origFileBuffer)
|
||||
// Process based on file type
|
||||
// ZIM files are handled specially since they have their own embedding workflow
|
||||
if (fileType === 'zim') {
|
||||
return await this.processZIMFile(filepath, deleteAfterEmbedding, batchOffset)
|
||||
}
|
||||
|
||||
if (!extractedText || extractedText.trim().length === 0) {
|
||||
return { success: false, message: 'No text could be extracted from the file.' }
|
||||
// Extract text based on file type
|
||||
let extractedText: string
|
||||
switch (fileType) {
|
||||
case 'image':
|
||||
extractedText = await this.processImageFile(fileBuffer!)
|
||||
break
|
||||
case 'pdf':
|
||||
extractedText = await this.processPDFFile(fileBuffer!)
|
||||
break
|
||||
case 'text':
|
||||
default:
|
||||
extractedText = await this.processTextFile(fileBuffer!)
|
||||
break
|
||||
}
|
||||
|
||||
const embedResult = await this.embedAndStoreText(extractedText, {
|
||||
source: filepath
|
||||
})
|
||||
|
||||
if (!embedResult) {
|
||||
return { success: false, message: 'Failed to embed and store the extracted text.' }
|
||||
}
|
||||
|
||||
if (deleteAfterEmbedding) {
|
||||
// Cleanup the file from disk
|
||||
logger.info(`[RAG] Embedding complete, deleting uploaded file: ${filepath}`)
|
||||
await deleteFileIfExists(filepath)
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
message: 'File processed and embedded successfully.',
|
||||
chunks: embedResult?.chunks,
|
||||
}
|
||||
// Embed extracted text and cleanup
|
||||
return await this.embedTextAndCleanup(extractedText, filepath, deleteAfterEmbedding)
|
||||
} catch (error) {
|
||||
logger.error('Error processing and embedding file:', error)
|
||||
logger.error('[RAG] Error processing and embedding file:', error)
|
||||
return { success: false, message: 'Error processing and embedding file.' }
|
||||
}
|
||||
}
|
||||
|
|
@ -497,6 +673,13 @@ export class RagService {
|
|||
keywords: (result.payload?.keywords as string) || '',
|
||||
chunk_index: (result.payload?.chunk_index as number) || 0,
|
||||
created_at: (result.payload?.created_at as number) || 0,
|
||||
// Enhanced ZIM metadata (likely to be undefined for non-ZIM content)
|
||||
article_title: result.payload?.article_title as string | undefined,
|
||||
section_title: result.payload?.section_title as string | undefined,
|
||||
full_title: result.payload?.full_title as string | undefined,
|
||||
hierarchy: result.payload?.hierarchy as string | undefined,
|
||||
document_id: result.payload?.document_id as string | undefined,
|
||||
content_type: result.payload?.content_type as string | undefined,
|
||||
}))
|
||||
|
||||
const rerankedResults = this.rerankResults(resultsWithMetadata, keywords, query)
|
||||
|
|
@ -508,7 +691,7 @@ export class RagService {
|
|||
)
|
||||
})
|
||||
|
||||
// Return top N results
|
||||
// Return top N results with enhanced metadata
|
||||
return rerankedResults.slice(0, limit).map((result) => ({
|
||||
text: result.text,
|
||||
score: result.finalScore,
|
||||
|
|
@ -516,6 +699,13 @@ export class RagService {
|
|||
chunk_index: result.chunk_index,
|
||||
created_at: result.created_at,
|
||||
semantic_score: result.score,
|
||||
// Enhanced ZIM metadata (likely to be undefined for non-ZIM content)
|
||||
article_title: result.article_title,
|
||||
section_title: result.section_title,
|
||||
full_title: result.full_title,
|
||||
hierarchy: result.hierarchy,
|
||||
document_id: result.document_id,
|
||||
content_type: result.content_type,
|
||||
},
|
||||
}))
|
||||
} catch (error) {
|
||||
|
|
@ -544,6 +734,12 @@ export class RagService {
|
|||
keywords: string
|
||||
chunk_index: number
|
||||
created_at: number
|
||||
article_title?: string
|
||||
section_title?: string
|
||||
full_title?: string
|
||||
hierarchy?: string
|
||||
document_id?: string
|
||||
content_type?: string
|
||||
}>,
|
||||
queryKeywords: string[],
|
||||
originalQuery: string
|
||||
|
|
@ -553,6 +749,12 @@ export class RagService {
|
|||
finalScore: number
|
||||
chunk_index: number
|
||||
created_at: number
|
||||
article_title?: string
|
||||
section_title?: string
|
||||
full_title?: string
|
||||
hierarchy?: string
|
||||
document_id?: string
|
||||
content_type?: string
|
||||
}> {
|
||||
return results
|
||||
.map((result) => {
|
||||
|
|
@ -711,11 +913,9 @@ export class RagService {
|
|||
for (const fileInfo of filesToEmbed) {
|
||||
try {
|
||||
logger.info(`[RAG] Dispatching embed job for: ${fileInfo.source}`)
|
||||
const stats = await getFileStatsIfExists(fileInfo.path)
|
||||
await EmbedFileJob.dispatch({
|
||||
filePath: fileInfo.path,
|
||||
fileName: fileInfo.source,
|
||||
fileSize: stats?.size,
|
||||
})
|
||||
logger.info(`[RAG] Successfully dispatched job for ${fileInfo.source}`)
|
||||
} catch (fileError) {
|
||||
|
|
|
|||
310
admin/app/services/zim_extraction_service.ts
Normal file
310
admin/app/services/zim_extraction_service.ts
Normal file
|
|
@ -0,0 +1,310 @@
|
|||
import { Archive, Entry } from '@openzim/libzim'
|
||||
import * as cheerio from 'cheerio'
|
||||
import { HTML_SELECTORS_TO_REMOVE, NON_CONTENT_HEADING_PATTERNS } from '../../constants/zim_extraction.js'
|
||||
import logger from '@adonisjs/core/services/logger'
|
||||
import { ExtractZIMChunkingStrategy, ExtractZIMContentOptions, ZIMContentChunk, ZIMArchiveMetadata } from '../../types/zim.js'
|
||||
import { randomUUID } from 'node:crypto'
|
||||
import { access } from 'node:fs/promises'
|
||||
|
||||
export class ZIMExtractionService {
|
||||
|
||||
private extractArchiveMetadata(archive: Archive): ZIMArchiveMetadata {
|
||||
try {
|
||||
return {
|
||||
title: archive.getMetadata('Title') || archive.getMetadata('Name') || 'Unknown',
|
||||
creator: archive.getMetadata('Creator') || 'Unknown',
|
||||
publisher: archive.getMetadata('Publisher') || 'Unknown',
|
||||
date: archive.getMetadata('Date') || 'Unknown',
|
||||
language: archive.getMetadata('Language') || 'Unknown',
|
||||
description: archive.getMetadata('Description') || '',
|
||||
}
|
||||
} catch (error) {
|
||||
logger.warn('[ZIMExtractionService]: Could not extract all metadata, using defaults', error)
|
||||
return {
|
||||
title: 'Unknown',
|
||||
creator: 'Unknown',
|
||||
publisher: 'Unknown',
|
||||
date: 'Unknown',
|
||||
language: 'Unknown',
|
||||
description: '',
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Breaks out a ZIM file's entries into their structured content form
|
||||
* to facilitate better indexing and retrieval.
|
||||
* Returns enhanced chunks with full article context and metadata.
|
||||
*
|
||||
* @param filePath - Path to the ZIM file
|
||||
* @param opts - Options including maxArticles, strategy, onProgress, startOffset, and batchSize
|
||||
*/
|
||||
async extractZIMContent(filePath: string, opts: ExtractZIMContentOptions = {}): Promise<ZIMContentChunk[]> {
|
||||
try {
|
||||
logger.info(`[ZIMExtractionService]: Processing ZIM file at path: ${filePath}`)
|
||||
|
||||
// defensive - check if file still exists before opening
|
||||
// could have been deleted by another process or batch
|
||||
try {
|
||||
await access(filePath)
|
||||
} catch (error) {
|
||||
logger.error(`[ZIMExtractionService]: ZIM file not accessible: ${filePath}`)
|
||||
throw new Error(`ZIM file not found or not accessible: ${filePath}`)
|
||||
}
|
||||
|
||||
const archive = new Archive(filePath)
|
||||
|
||||
// Extract archive-level metadata once
|
||||
const archiveMetadata = this.extractArchiveMetadata(archive)
|
||||
logger.info(`[ZIMExtractionService]: Archive metadata - Title: ${archiveMetadata.title}, Language: ${archiveMetadata.language}`)
|
||||
|
||||
let articlesProcessed = 0
|
||||
let articlesSkipped = 0
|
||||
const processedPaths = new Set<string>()
|
||||
const toReturn: ZIMContentChunk[] = []
|
||||
|
||||
// Support batch processing to avoid lock timeouts on large ZIM files
|
||||
const startOffset = opts.startOffset || 0
|
||||
const batchSize = opts.batchSize || (opts.maxArticles || Infinity)
|
||||
|
||||
for (const entry of archive.iterByPath()) {
|
||||
// Skip articles until we reach the start offset
|
||||
if (articlesSkipped < startOffset) {
|
||||
if (this.isArticleEntry(entry) && !processedPaths.has(entry.path)) {
|
||||
articlesSkipped++
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if (articlesProcessed >= batchSize) {
|
||||
break
|
||||
}
|
||||
|
||||
if (!this.isArticleEntry(entry)) {
|
||||
logger.debug(`[ZIMExtractionService]: Skipping non-article entry at path: ${entry.path}`)
|
||||
continue
|
||||
}
|
||||
|
||||
if (processedPaths.has(entry.path)) {
|
||||
logger.debug(`[ZIMExtractionService]: Skipping duplicate entry at path: ${entry.path}`)
|
||||
continue
|
||||
}
|
||||
processedPaths.add(entry.path)
|
||||
|
||||
const item = entry.item
|
||||
const blob = item.data
|
||||
const html = this.getCleanedHTMLString(blob.data)
|
||||
|
||||
const strategy = opts.strategy || this.chooseChunkingStrategy(html);
|
||||
logger.debug(`[ZIMExtractionService]: Chosen chunking strategy for path ${entry.path}: ${strategy}`)
|
||||
|
||||
// Generate a unique document ID. All chunks from same article will share it
|
||||
const documentId = randomUUID()
|
||||
const articleTitle = entry.title || entry.path
|
||||
|
||||
let chunks: ZIMContentChunk[]
|
||||
|
||||
if (strategy === 'structured') {
|
||||
const structured = this.extractStructuredContent(html)
|
||||
chunks = structured.sections.map(s => ({
|
||||
text: s.text,
|
||||
articleTitle,
|
||||
articlePath: entry.path,
|
||||
sectionTitle: s.heading,
|
||||
fullTitle: `${articleTitle} - ${s.heading}`,
|
||||
hierarchy: `${articleTitle} > ${s.heading}`,
|
||||
sectionLevel: s.level,
|
||||
documentId,
|
||||
archiveMetadata,
|
||||
strategy,
|
||||
}))
|
||||
} else {
|
||||
// Simple strategy - entire article as one chunk
|
||||
const text = this.extractTextFromHTML(html) || ''
|
||||
chunks = [{
|
||||
text,
|
||||
articleTitle,
|
||||
articlePath: entry.path,
|
||||
sectionTitle: articleTitle, // Same as article for simple strategy
|
||||
fullTitle: articleTitle,
|
||||
hierarchy: articleTitle,
|
||||
documentId,
|
||||
archiveMetadata,
|
||||
strategy,
|
||||
}]
|
||||
}
|
||||
|
||||
logger.debug(`Extracted ${chunks.length} chunks from article at path: ${entry.path} using strategy: ${strategy}`)
|
||||
|
||||
const nonEmptyChunks = chunks.filter(c => c.text.trim().length > 0)
|
||||
logger.debug(`After filtering empty chunks, ${nonEmptyChunks.length} chunks remain for article at path: ${entry.path}`)
|
||||
toReturn.push(...nonEmptyChunks)
|
||||
articlesProcessed++
|
||||
|
||||
if (opts.onProgress) {
|
||||
opts.onProgress(articlesProcessed, archive.articleCount)
|
||||
}
|
||||
}
|
||||
|
||||
logger.info(`[ZIMExtractionService]: Completed processing ZIM file. Total articles processed: ${articlesProcessed}`)
|
||||
logger.debug("Final structured content sample:", toReturn.slice(0, 3).map(c => ({
|
||||
articleTitle: c.articleTitle,
|
||||
sectionTitle: c.sectionTitle,
|
||||
hierarchy: c.hierarchy,
|
||||
textPreview: c.text.substring(0, 100)
|
||||
})))
|
||||
logger.debug("Total structured sections extracted:", toReturn.length)
|
||||
return toReturn
|
||||
} catch (error) {
|
||||
logger.error('Error processing ZIM file:', error)
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
private chooseChunkingStrategy(html: string, options = {
|
||||
forceStrategy: null as ExtractZIMChunkingStrategy | null,
|
||||
}): ExtractZIMChunkingStrategy {
|
||||
const {
|
||||
forceStrategy = null,
|
||||
} = options;
|
||||
|
||||
if (forceStrategy) return forceStrategy;
|
||||
|
||||
// Use a simple analysis to determin if the HTML has any meaningful structure
|
||||
// that we can leverage for better chunking. If not, we'll just chunk it as one big piece of text.
|
||||
return this.hasStructuredHeadings(html) ? 'structured' : 'simple';
|
||||
}
|
||||
|
||||
private getCleanedHTMLString(buff: Buffer<ArrayBufferLike>): string {
|
||||
const rawString = buff.toString('utf-8');
|
||||
const $ = cheerio.load(rawString);
|
||||
|
||||
HTML_SELECTORS_TO_REMOVE.forEach((selector) => {
|
||||
$(selector).remove()
|
||||
});
|
||||
|
||||
return $.html();
|
||||
}
|
||||
|
||||
private extractTextFromHTML(html: string): string | null {
|
||||
try {
|
||||
const $ = cheerio.load(html)
|
||||
|
||||
// Search body first, then root if body is absent
|
||||
const text = $('body').length ? $('body').text() : $.root().text()
|
||||
|
||||
return text.replace(/\s+/g, ' ').replace(/\n\s*\n/g, '\n').trim()
|
||||
} catch (error) {
|
||||
logger.error('Error extracting text from HTML:', error)
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
private extractStructuredContent(html: string) {
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const title = $('h1').first().text().trim() || $('title').text().trim();
|
||||
|
||||
// Extract sections with their headings and heading levels
|
||||
const sections: Array<{ heading: string; text: string; level: number }> = [];
|
||||
let currentSection = { heading: 'Introduction', content: [] as string[], level: 2 };
|
||||
|
||||
$('body').children().each((_, element) => {
|
||||
const $el = $(element);
|
||||
const tagName = element.tagName?.toLowerCase();
|
||||
|
||||
if (['h2', 'h3', 'h4'].includes(tagName)) {
|
||||
// Save current section if it has content
|
||||
if (currentSection.content.length > 0) {
|
||||
sections.push({
|
||||
heading: currentSection.heading,
|
||||
text: currentSection.content.join(' ').replace(/\s+/g, ' ').trim(),
|
||||
level: currentSection.level,
|
||||
});
|
||||
}
|
||||
// Start new section
|
||||
const level = parseInt(tagName.substring(1)); // Extract number from h2, h3, h4
|
||||
currentSection = {
|
||||
heading: $el.text().replace(/\[edit\]/gi, '').trim(),
|
||||
content: [],
|
||||
level,
|
||||
};
|
||||
} else if (['p', 'ul', 'ol', 'dl', 'table'].includes(tagName)) {
|
||||
const text = $el.text().trim();
|
||||
if (text.length > 0) {
|
||||
currentSection.content.push(text);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Push the last section if it has content
|
||||
if (currentSection.content.length > 0) {
|
||||
sections.push({
|
||||
heading: currentSection.heading,
|
||||
text: currentSection.content.join(' ').replace(/\s+/g, ' ').trim(),
|
||||
level: currentSection.level,
|
||||
});
|
||||
}
|
||||
|
||||
return {
|
||||
title,
|
||||
sections,
|
||||
fullText: sections.map(s => `${s.heading}\n${s.text}`).join('\n\n'),
|
||||
};
|
||||
}
|
||||
|
||||
private hasStructuredHeadings(html: string): boolean {
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const headings = $('h2, h3').toArray();
|
||||
|
||||
// Consider it structured if it has at least 2 headings to break content into meaningful sections
|
||||
if (headings.length < 2) return false;
|
||||
|
||||
// Check that headings have substantial content between them
|
||||
let sectionsWithContent = 0;
|
||||
|
||||
for (const heading of headings) {
|
||||
const $heading = $(heading);
|
||||
const headingText = $heading.text().trim();
|
||||
|
||||
// Skip empty or very short headings, likely not meaningful
|
||||
if (headingText.length < 3) continue;
|
||||
|
||||
// Skip common non-content headings
|
||||
if (NON_CONTENT_HEADING_PATTERNS.some(pattern => pattern.test(headingText))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Content until next heading
|
||||
let contentLength = 0;
|
||||
let $next = $heading.next();
|
||||
|
||||
while ($next.length && !$next.is('h1, h2, h3, h4')) {
|
||||
contentLength += $next.text().trim().length;
|
||||
$next = $next.next();
|
||||
}
|
||||
|
||||
// Consider it a real section if it has at least 100 chars of content
|
||||
if (contentLength >= 100) {
|
||||
sectionsWithContent++;
|
||||
}
|
||||
}
|
||||
|
||||
// Require at least 2 sections with substantial content
|
||||
return sectionsWithContent >= 2;
|
||||
}
|
||||
|
||||
private isArticleEntry(entry: Entry): boolean {
|
||||
try {
|
||||
if (entry.isRedirect) return false;
|
||||
|
||||
const item = entry.item;
|
||||
const mimeType = item.mimetype;
|
||||
|
||||
return mimeType === 'text/html' || mimeType === 'application/xhtml+xml';
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -43,7 +43,7 @@ interface IZimService {
|
|||
|
||||
@inject()
|
||||
export class ZimService implements IZimService {
|
||||
constructor(private dockerService: DockerService) {}
|
||||
constructor(private dockerService: DockerService) { }
|
||||
|
||||
async list() {
|
||||
const dirPath = join(process.cwd(), ZIM_STORAGE_PATH)
|
||||
|
|
@ -264,7 +264,7 @@ export class ZimService implements IZimService {
|
|||
}
|
||||
|
||||
return downloadFilenames.length > 0 ? downloadFilenames : null
|
||||
}
|
||||
}
|
||||
|
||||
async downloadRemoteSuccessCallback(urls: string[], restart = true) {
|
||||
// Check if any URL is a Wikipedia download and handle it
|
||||
|
|
@ -275,28 +275,28 @@ export class ZimService implements IZimService {
|
|||
}
|
||||
|
||||
if (restart) {
|
||||
// Check if there are any remaining ZIM download jobs before restarting
|
||||
// Check if there are any remaining ZIM download jobs before restarting
|
||||
const { QueueService } = await import('./queue_service.js')
|
||||
const queueService = new QueueService()
|
||||
const queue = queueService.getQueue('downloads')
|
||||
|
||||
|
||||
// Get all active and waiting jobs
|
||||
const [activeJobs, waitingJobs] = await Promise.all([
|
||||
queue.getActive(),
|
||||
queue.getWaiting(),
|
||||
])
|
||||
|
||||
|
||||
// Filter out completed jobs (progress === 100) to avoid race condition
|
||||
// where this job itself is still in the active queue
|
||||
const activeIncompleteJobs = activeJobs.filter((job) => {
|
||||
const progress = typeof job.progress === 'number' ? job.progress : 0
|
||||
return progress < 100
|
||||
})
|
||||
|
||||
|
||||
// Check if any remaining incomplete jobs are ZIM downloads
|
||||
const allJobs = [...activeIncompleteJobs, ...waitingJobs]
|
||||
const hasRemainingZimJobs = allJobs.some((job) => job.data.filetype === 'zim')
|
||||
|
||||
|
||||
if (hasRemainingZimJobs) {
|
||||
logger.info('[ZimService] Skipping container restart - more ZIM downloads pending')
|
||||
} else {
|
||||
|
|
@ -364,7 +364,7 @@ export class ZimService implements IZimService {
|
|||
// Check each tier from highest to lowest (assuming tiers are ordered from low to high)
|
||||
// We check in reverse to find the highest fully-installed tier
|
||||
const reversedTiers = [...category.tiers].reverse()
|
||||
|
||||
|
||||
for (const tier of reversedTiers) {
|
||||
const allResourcesInstalled = tier.resources.every((resource) => {
|
||||
// Check if resource is marked as downloaded in database
|
||||
|
|
@ -408,7 +408,7 @@ export class ZimService implements IZimService {
|
|||
|
||||
for (const collection of validated.collections) {
|
||||
const { resources, ...restCollection } = collection; // we'll handle resources separately
|
||||
|
||||
|
||||
// Upsert the collection itself
|
||||
await CuratedCollection.updateOrCreate(
|
||||
{ slug: restCollection.slug },
|
||||
|
|
@ -489,11 +489,11 @@ export class ZimService implements IZimService {
|
|||
options,
|
||||
currentSelection: selection
|
||||
? {
|
||||
optionId: selection.option_id,
|
||||
status: selection.status,
|
||||
filename: selection.filename,
|
||||
url: selection.url,
|
||||
}
|
||||
optionId: selection.option_id,
|
||||
status: selection.status,
|
||||
filename: selection.filename,
|
||||
url: selection.url,
|
||||
}
|
||||
: null,
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -152,7 +152,7 @@ export function matchesDevice(fsPath: string, deviceName: string): boolean {
|
|||
return false
|
||||
}
|
||||
|
||||
export function determineFileType(filename: string): 'image' | 'pdf' | 'text' | 'unknown' {
|
||||
export function determineFileType(filename: string): 'image' | 'pdf' | 'text' | 'zim' | 'unknown' {
|
||||
const ext = path.extname(filename).toLowerCase()
|
||||
if (['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'].includes(ext)) {
|
||||
return 'image'
|
||||
|
|
@ -160,6 +160,8 @@ export function determineFileType(filename: string): 'image' | 'pdf' | 'text' |
|
|||
return 'pdf'
|
||||
} else if (['.txt', '.md', '.docx', '.rtf'].includes(ext)) {
|
||||
return 'text'
|
||||
} else if (ext === '.zim') {
|
||||
return 'zim'
|
||||
} else {
|
||||
return 'unknown'
|
||||
}
|
||||
|
|
|
|||
48
admin/constants/zim_extraction.ts
Normal file
48
admin/constants/zim_extraction.ts
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
|
||||
export const HTML_SELECTORS_TO_REMOVE = [
|
||||
'script',
|
||||
'style',
|
||||
'nav',
|
||||
'header',
|
||||
'footer',
|
||||
'noscript',
|
||||
'iframe',
|
||||
'svg',
|
||||
'.navbox',
|
||||
'.sidebar',
|
||||
'.infobox',
|
||||
'.mw-editsection',
|
||||
'.reference',
|
||||
'.reflist',
|
||||
'.toc',
|
||||
'.noprint',
|
||||
'.mw-jump-link',
|
||||
'.mw-headline-anchor',
|
||||
'[role="navigation"]',
|
||||
'.navbar',
|
||||
'.hatnote',
|
||||
'.ambox',
|
||||
'.sistersitebox',
|
||||
'.portal',
|
||||
'#coordinates',
|
||||
'.geo-nondefault',
|
||||
'.authority-control',
|
||||
]
|
||||
|
||||
// Common heading names that usually don't have meaningful content under them
|
||||
export const NON_CONTENT_HEADING_PATTERNS = [
|
||||
/^see also$/i,
|
||||
/^references$/i,
|
||||
/^external links$/i,
|
||||
/^further reading$/i,
|
||||
/^notes$/i,
|
||||
/^bibliography$/i,
|
||||
/^navigation$/i,
|
||||
]
|
||||
|
||||
/**
|
||||
* Batch size for processing ZIM articles to prevent lock timeout errors.
|
||||
* Processing 50 articles at a time balances throughput with job duration.
|
||||
* Typical processing time: 2-5 minutes per batch depending on article complexity.
|
||||
*/
|
||||
export const ZIM_BATCH_SIZE = 50
|
||||
1501
admin/package-lock.json
generated
1501
admin/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
|
|
@ -76,6 +76,7 @@
|
|||
"@headlessui/react": "^2.2.4",
|
||||
"@inertiajs/react": "^2.0.13",
|
||||
"@markdoc/markdoc": "^0.5.2",
|
||||
"@openzim/libzim": "^4.0.0",
|
||||
"@protomaps/basemaps": "^5.7.0",
|
||||
"@qdrant/js-client-rest": "^1.16.2",
|
||||
"@tabler/icons-react": "^3.34.0",
|
||||
|
|
@ -92,6 +93,7 @@
|
|||
"axios": "^1.13.1",
|
||||
"better-sqlite3": "^12.1.1",
|
||||
"bullmq": "^5.65.1",
|
||||
"cheerio": "^1.2.0",
|
||||
"dockerode": "^4.0.7",
|
||||
"edge.js": "^6.2.1",
|
||||
"fast-xml-parser": "^5.2.5",
|
||||
|
|
|
|||
|
|
@ -64,3 +64,47 @@ export type RemoteZimFileEntry = {
|
|||
author: string
|
||||
file_name: string
|
||||
}
|
||||
|
||||
export type ExtractZIMContentOptions = {
|
||||
strategy?: ExtractZIMChunkingStrategy
|
||||
maxArticles?: number
|
||||
onProgress?: (processedArticles: number, totalArticles: number) => void
|
||||
// Batch processing options to avoid lock timeouts
|
||||
startOffset?: number // Article index to start from for resuming
|
||||
batchSize?: number // Max articles to process in this batch
|
||||
}
|
||||
|
||||
export type ExtractZIMChunkingStrategy = 'structured' | 'simple'
|
||||
|
||||
export type ZIMArchiveMetadata = {
|
||||
title: string
|
||||
creator: string
|
||||
publisher: string
|
||||
date: string
|
||||
language: string
|
||||
description: string
|
||||
}
|
||||
|
||||
export type ZIMContentChunk = {
|
||||
// Content
|
||||
text: string
|
||||
|
||||
// Article-level context
|
||||
articleTitle: string
|
||||
articlePath: string
|
||||
|
||||
// Section-level context for structured chunks
|
||||
sectionTitle: string
|
||||
fullTitle: string // Combined "Article Title - Section Title"
|
||||
hierarchy: string // Breadcrumb trail
|
||||
sectionLevel?: number // Heading level (2=h2, 3=h3, etc.)
|
||||
|
||||
// Document grouping
|
||||
documentId: string // Same for all chunks from one article
|
||||
|
||||
// Archive metadata
|
||||
archiveMetadata: ZIMArchiveMetadata
|
||||
|
||||
// Extraction metadata
|
||||
strategy: ExtractZIMChunkingStrategy
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user