From dc7abfd41aca53c4e81537051720d8f80824e295 Mon Sep 17 00:00:00 2001
From: brian <brian@ratlm.com>
Date: Fri, 13 Mar 2026 13:25:01 -0400
Subject: [PATCH] feat(rag): add EPUB file support for Knowledge Base uploads

EPUBs are ZIP archives containing structured XHTML content with semantic
chapter/section markup, making them well-suited for RAG text extraction
and chunking.

Changes:
- Add 'epub' to determineFileType() in utils/fs.ts
- Add processEPUBFile() method in rag_service.ts that:
  - Reads container.xml to locate the OPF manifest
  - Parses the OPF spine for correct reading order
  - Extracts text from each XHTML content document using cheerio
  - Falls back to all manifest items if no spine is found
- Wire epub case into processAndEmbedFile() switch
- Add jszip dependency for ZIP archive reading (cheerio already present)

Related to #253 (EPUB is a common format for Project Gutenberg
content and technical reference books)
---
 admin/app/services/rag_service.ts | 85 +++++++++++++++++++++++++++++++
 admin/app/utils/fs.ts             |  4 +-
 admin/package.json                |  3 +-
 3 files changed, 90 insertions(+), 2 deletions(-)
diff --git a/admin/app/services/rag_service.ts b/admin/app/services/rag_service.ts
index e6ac043..5f003d9 100644
--- a/admin/app/services/rag_service.ts
+++ b/admin/app/services/rag_service.ts
@@ -8,6 +8,8 @@ import { deleteFileIfExists, determineFileType, getFile, getFileStatsIfExists, l
 import { PDFParse } from 'pdf-parse'
 import { createWorker } from 'tesseract.js'
 import { fromBuffer } from 'pdf2pic'
+import JSZip from 'jszip'
+import * as cheerio from 'cheerio'
 import { OllamaService } from './ollama_service.js'
 import { SERVICE_NAMES } from '../../constants/service_names.js'
 import { removeStopwords } from 'stopword'
@@ -564,6 +566,113 @@ export class RagService {
     return await this.extractTXTText(fileBuffer)
   }
 
+  /**
+   * Extract the plain text of an EPUB file.
+   *
+   * An EPUB is a ZIP archive: META-INF/container.xml names the OPF package
+   * document, whose manifest lists the content files and whose spine gives
+   * their reading order. Each HTML/XHTML document is read in spine order (or
+   * manifest order when the spine yields nothing) and its body text collected.
+   *
+   * @param fileBuffer Raw bytes of the .epub file.
+   * @returns Text of all readable content documents, joined by blank lines.
+   * @throws Error when container.xml, its rootfile path, or the OPF is missing.
+   */
+  private async processEPUBFile(fileBuffer: Buffer): Promise<string> {
+    const zip = await JSZip.loadAsync(fileBuffer)
+
+    // Read container.xml to find the OPF file path
+    const containerXml = await zip.file('META-INF/container.xml')?.async('text')
+    if (!containerXml) {
+      throw new Error('Invalid EPUB: missing META-INF/container.xml')
+    }
+
+    // Parse container.xml to get the OPF rootfile path
+    const $container = cheerio.load(containerXml, { xml: true })
+    const opfPath = $container('rootfile').attr('full-path')
+    if (!opfPath) {
+      throw new Error('Invalid EPUB: no rootfile found in container.xml')
+    }
+
+    // Read and parse the OPF file
+    const opfContent = await zip.file(opfPath)?.async('text')
+    if (!opfContent) {
+      throw new Error(`Invalid EPUB: OPF file not found at ${opfPath}`)
+    }
+    const $opf = cheerio.load(opfContent, { xml: true })
+
+    // Manifest hrefs are URL references relative to the OPF file: they may be
+    // percent-encoded ("ch%201.xhtml"), contain "../" segments or carry a
+    // "#fragment", whereas ZIP entry names are plain paths. Resolve each href
+    // against a placeholder origin so the URL parser does the normalisation.
+    const opfDir = opfPath.substring(0, opfPath.lastIndexOf('/') + 1)
+    const opfUrl = new URL(opfPath.split('/').map(encodeURIComponent).join('/'), 'http://epub.invalid/')
+    const toZipPath = (href: string): string | undefined => {
+      try {
+        const resolved = new URL(href, opfUrl)
+        // A different origin means a remote resource, not an archive entry
+        if (resolved.origin !== opfUrl.origin) {
+          return undefined
+        }
+        return decodeURIComponent(resolved.pathname.slice(1))
+      } catch {
+        // Unparseable href or malformed escape: use plain concatenation
+        return opfDir + href
+      }
+    }
+
+    // Build a map of manifest items (id -> href), HTML/XHTML documents only.
+    // "application/xhtml+xml" already contains "html"; also matching "xml"
+    // would admit the NCX table of contents, SVG images and similar files.
+    const manifestItems = new Map<string, string>()
+    $opf('manifest item').each((_, el) => {
+      const id = $opf(el).attr('id')
+      const href = $opf(el).attr('href')
+      const mediaType = $opf(el).attr('media-type') ?? ''
+      if (id && href && mediaType.toLowerCase().includes('html')) {
+        manifestItems.set(id, href)
+      }
+    })
+
+    // Get the reading order from the spine
+    const spineOrder: string[] = []
+    $opf('spine itemref').each((_, el) => {
+      const idref = $opf(el).attr('idref')
+      const href = idref ? manifestItems.get(idref) : undefined
+      if (href) {
+        spineOrder.push(href)
+      }
+    })
+
+    // If no spine found, fall back to all manifest items
+    const contentFiles = spineOrder.length > 0
+      ? spineOrder
+      : Array.from(manifestItems.values())
+
+    // Extract text from each content file in order
+    const textParts: string[] = []
+    for (const href of contentFiles) {
+      const zipPath = toZipPath(href)
+      const content = zipPath ? await zip.file(zipPath)?.async('text') : undefined
+      if (!content) {
+        // Keep going: one missing or empty document should not fail the upload
+        logger.debug(`[RAG] EPUB content document unavailable, skipping: ${href}`)
+        continue
+      }
+      const $ = cheerio.load(content)
+      // Remove script and style elements
+      $('script, style').remove()
+      const text = $('body').text().trim()
+      if (text) {
+        textParts.push(text)
+      }
+    }
+
+    const fullText = textParts.join('\n\n')
+    logger.debug(`[RAG] EPUB extracted ${textParts.length} chapters, ${fullText.length} characters total`)
+    return fullText
+  }
+
   private async embedTextAndCleanup(
     extractedText: string,
     filepath: string,
@@ -638,6 +720,9 @@ export class RagService {
         case 'pdf':
           extractedText = await this.processPDFFile(fileBuffer!)
           break
+        case 'epub':
+          extractedText = await this.processEPUBFile(fileBuffer!)
+          break
         case 'text':
         default:
           extractedText = await this.processTextFile(fileBuffer!)
diff --git a/admin/app/utils/fs.ts b/admin/app/utils/fs.ts
index 7cc3ba8..ecba222 100644
--- a/admin/app/utils/fs.ts
+++ b/admin/app/utils/fs.ts
@@ -152,7 +152,7 @@ export function matchesDevice(fsPath: string, deviceName: string): boolean {
   return false
 }
 
-export function determineFileType(filename: string): 'image' | 'pdf' | 'text' | 'zim' | 'unknown' {
+export function determineFileType(filename: string): 'image' | 'pdf' | 'text' | 'epub' | 'zim' | 'unknown' {
   const ext = path.extname(filename).toLowerCase()
   if (['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'].includes(ext)) {
     return 'image'
@@ -160,6 +160,8 @@ export function determineFileType(filename: string): 'image' | 'pdf' | 'text' |
     return 'pdf'
   } else if (['.txt', '.md', '.docx', '.rtf'].includes(ext)) {
     return 'text'
+  } else if (ext === '.epub') {
+    return 'epub'
   } else if (ext === '.zim') {
     return 'zim'
   } else {
diff --git a/admin/package.json b/admin/package.json
index 5ad1a60..2fd46c7 100644
--- a/admin/package.json
+++ b/admin/package.json
@@ -121,7 +121,8 @@
     "tar": "^7.5.10",
     "tesseract.js": "^7.0.0",
     "url-join": "^5.0.0",
-    "yaml": "^2.8.0"
+    "yaml": "^2.8.0",
+    "jszip": "^3.10.1"
   },
   "hotHook": {
     "boundaries": [