feat(rag): add EPUB file support for Knowledge Base uploads

EPUBs are ZIP archives containing structured XHTML content with semantic chapter/section markup, making them well-suited for RAG text extraction and chunking. Changes: - Add 'epub' to determineFileType() in utils/fs.ts - Add processEPUBFile() method in rag_service.ts that: - Reads container.xml to locate the OPF manifest - Parses the OPF spine for correct reading order - Extracts text from each XHTML content document using cheerio - Falls back to all manifest items if no spine is found - Wire epub case into processAndEmbedFile() switch - Add jszip dependency for ZIP archive reading (cheerio already present) Closes #253-adjacent (epub is a common format for Project Gutenberg content and technical reference books)
2026-04-04 15:56:16 +02:00 · 2026-03-13 13:25:01 -04:00 · 2026-03-13 13:25:01 -04:00 · dc7abfd41a
commit dc7abfd41a
parent db22b0c5f6
3 changed files with 90 additions and 2 deletions
--- a/admin/app/services/rag_service.ts
+++ b/admin/app/services/rag_service.ts
@ -8,6 +8,8 @@ import { deleteFileIfExists, determineFileType, getFile, getFileStatsIfExists, l
 import { PDFParse } from 'pdf-parse'
 import { createWorker } from 'tesseract.js'
 import { fromBuffer } from 'pdf2pic'
 import JSZip from 'jszip'
 import * as cheerio from 'cheerio'
 import { OllamaService } from './ollama_service.js'
 import { SERVICE_NAMES } from '../../constants/service_names.js'
 import { removeStopwords } from 'stopword'
@ -564,6 +566,86 @@ export class RagService {
    return await this.extractTXTText(fileBuffer)
  }
  /**
   * Extract text content from an EPUB file.
   * EPUBs are ZIP archives containing XHTML content files.
   * Reads the OPF manifest to determine reading order, then extracts
   * text from each content document in sequence.
   */
  private async processEPUBFile(fileBuffer: Buffer): Promise<string> {
    const zip = await JSZip.loadAsync(fileBuffer)
    // Read container.xml to find the OPF file path
    const containerXml = await zip.file('META-INF/container.xml')?.async('text')
    if (!containerXml) {
      throw new Error('Invalid EPUB: missing META-INF/container.xml')
    }
    // Parse container.xml to get the OPF rootfile path
    const $container = cheerio.load(containerXml, { xml: true })
    const opfPath = $container('rootfile').attr('full-path')
    if (!opfPath) {
      throw new Error('Invalid EPUB: no rootfile found in container.xml')
    }
    // Determine the base directory of the OPF file for resolving relative paths
    const opfDir = opfPath.includes('/') ? opfPath.substring(0, opfPath.lastIndexOf('/') + 1) : ''
    // Read and parse the OPF file
    const opfContent = await zip.file(opfPath)?.async('text')
    if (!opfContent) {
      throw new Error(`Invalid EPUB: OPF file not found at ${opfPath}`)
    }
    const $opf = cheerio.load(opfContent, { xml: true })
    // Build a map of manifest items (id -> href)
    const manifestItems = new Map<string, string>()
    $opf('manifest item').each((_, el) => {
      const id = $opf(el).attr('id')
      const href = $opf(el).attr('href')
      const mediaType = $opf(el).attr('media-type') || ''
      // Only include XHTML/HTML content documents
      if (id && href && (mediaType.includes('html') || mediaType.includes('xml'))) {
        manifestItems.set(id, href)
      }
    })
    // Get the reading order from the spine
    const spineOrder: string[] = []
    $opf('spine itemref').each((_, el) => {
      const idref = $opf(el).attr('idref')
      if (idref && manifestItems.has(idref)) {
        spineOrder.push(manifestItems.get(idref)!)
      }
    })
    // If no spine found, fall back to all manifest items
    const contentFiles = spineOrder.length > 0
      ? spineOrder
      : Array.from(manifestItems.values())
    // Extract text from each content file in order
    const textParts: string[] = []
    for (const href of contentFiles) {
      const fullPath = opfDir + href
      const content = await zip.file(fullPath)?.async('text')
      if (content) {
        const $ = cheerio.load(content)
        // Remove script and style elements
        $('script, style').remove()
        const text = $('body').text().trim()
        if (text) {
          textParts.push(text)
        }
      }
    }
    const fullText = textParts.join('\n\n')
    logger.debug(`[RAG] EPUB extracted ${textParts.length} chapters, ${fullText.length} characters total`)
    return fullText
  }
  private async embedTextAndCleanup(
    extractedText: string,
    filepath: string,
@ -638,6 +720,9 @@ export class RagService {
        case 'pdf':
          extractedText = await this.processPDFFile(fileBuffer!)
          break
        case 'epub':
          extractedText = await this.processEPUBFile(fileBuffer!)
          break
        case 'text':
        default:
          extractedText = await this.processTextFile(fileBuffer!)
--- a/admin/app/utils/fs.ts
+++ b/admin/app/utils/fs.ts
@ -152,7 +152,7 @@ export function matchesDevice(fsPath: string, deviceName: string): boolean {
  return false
 }
-export function determineFileType(filename: string): 'image' | 'pdf' | 'text' | 'zim' | 'unknown' {
+export function determineFileType(filename: string): 'image' | 'pdf' | 'text' | 'epub' | 'zim' | 'unknown' {
  const ext = path.extname(filename).toLowerCase()
  if (['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'].includes(ext)) {
    return 'image'
@ -160,6 +160,8 @@ export function determineFileType(filename: string): 'image' | 'pdf' | 'text' |
    return 'pdf'
  } else if (['.txt', '.md', '.docx', '.rtf'].includes(ext)) {
    return 'text'
  } else if (ext === '.epub') {
    return 'epub'
  } else if (ext === '.zim') {
    return 'zim'
  } else {
--- a/admin/package.json
+++ b/admin/package.json
@ -121,7 +121,8 @@
    "tar": "^7.5.10",
    "tesseract.js": "^7.0.0",
    "url-join": "^5.0.0",
-    "yaml": "^2.8.0"
+    "yaml": "^2.8.0",
    "jszip": "^3.10.1"
  },
  "hotHook": {
    "boundaries": [