From dc7abfd41aca53c4e81537051720d8f80824e295 Mon Sep 17 00:00:00 2001 From: brian Date: Fri, 13 Mar 2026 13:25:01 -0400 Subject: [PATCH] feat(rag): add EPUB file support for Knowledge Base uploads EPUBs are ZIP archives containing structured XHTML content with semantic chapter/section markup, making them well-suited for RAG text extraction and chunking. Changes: - Add 'epub' to determineFileType() in utils/fs.ts - Add processEPUBFile() method in rag_service.ts that: - Reads container.xml to locate the OPF manifest - Parses the OPF spine for correct reading order - Extracts text from each XHTML content document using cheerio - Falls back to all manifest items if no spine is found - Wire epub case into processAndEmbedFile() switch - Add jszip dependency for ZIP archive reading (cheerio already present) Closes #253-adjacent (epub is a common format for Project Gutenberg content and technical reference books) --- admin/app/services/rag_service.ts | 85 +++++++++++++++++++++++++++++++ admin/app/utils/fs.ts | 4 +- admin/package.json | 3 +- 3 files changed, 90 insertions(+), 2 deletions(-) diff --git a/admin/app/services/rag_service.ts b/admin/app/services/rag_service.ts index e6ac043..5f003d9 100644 --- a/admin/app/services/rag_service.ts +++ b/admin/app/services/rag_service.ts @@ -8,6 +8,8 @@ import { deleteFileIfExists, determineFileType, getFile, getFileStatsIfExists, l import { PDFParse } from 'pdf-parse' import { createWorker } from 'tesseract.js' import { fromBuffer } from 'pdf2pic' +import JSZip from 'jszip' +import * as cheerio from 'cheerio' import { OllamaService } from './ollama_service.js' import { SERVICE_NAMES } from '../../constants/service_names.js' import { removeStopwords } from 'stopword' @@ -564,6 +566,86 @@ export class RagService { return await this.extractTXTText(fileBuffer) } + /** + * Extract text content from an EPUB file. + * EPUBs are ZIP archives containing XHTML content files. + * Reads the OPF manifest to determine reading order, then extracts + * text from each content document in sequence. + */ + private async processEPUBFile(fileBuffer: Buffer): Promise { + const zip = await JSZip.loadAsync(fileBuffer) + + // Read container.xml to find the OPF file path + const containerXml = await zip.file('META-INF/container.xml')?.async('text') + if (!containerXml) { + throw new Error('Invalid EPUB: missing META-INF/container.xml') + } + + // Parse container.xml to get the OPF rootfile path + const $container = cheerio.load(containerXml, { xml: true }) + const opfPath = $container('rootfile').attr('full-path') + if (!opfPath) { + throw new Error('Invalid EPUB: no rootfile found in container.xml') + } + + // Determine the base directory of the OPF file for resolving relative paths + const opfDir = opfPath.includes('/') ? opfPath.substring(0, opfPath.lastIndexOf('/') + 1) : '' + + // Read and parse the OPF file + const opfContent = await zip.file(opfPath)?.async('text') + if (!opfContent) { + throw new Error(`Invalid EPUB: OPF file not found at ${opfPath}`) + } + + const $opf = cheerio.load(opfContent, { xml: true }) + + // Build a map of manifest items (id -> href) + const manifestItems = new Map() + $opf('manifest item').each((_, el) => { + const id = $opf(el).attr('id') + const href = $opf(el).attr('href') + const mediaType = $opf(el).attr('media-type') || '' + // Only include XHTML/HTML content documents + if (id && href && (mediaType.includes('html') || mediaType.includes('xml'))) { + manifestItems.set(id, href) + } + }) + + // Get the reading order from the spine + const spineOrder: string[] = [] + $opf('spine itemref').each((_, el) => { + const idref = $opf(el).attr('idref') + if (idref && manifestItems.has(idref)) { + spineOrder.push(manifestItems.get(idref)!) + } + }) + + // If no spine found, fall back to all manifest items + const contentFiles = spineOrder.length > 0 + ? spineOrder + : Array.from(manifestItems.values()) + + // Extract text from each content file in order + const textParts: string[] = [] + for (const href of contentFiles) { + const fullPath = opfDir + href + const content = await zip.file(fullPath)?.async('text') + if (content) { + const $ = cheerio.load(content) + // Remove script and style elements + $('script, style').remove() + const text = $('body').text().trim() + if (text) { + textParts.push(text) + } + } + } + + const fullText = textParts.join('\n\n') + logger.debug(`[RAG] EPUB extracted ${textParts.length} chapters, ${fullText.length} characters total`) + return fullText + } + private async embedTextAndCleanup( extractedText: string, filepath: string, @@ -638,6 +720,9 @@ export class RagService { case 'pdf': extractedText = await this.processPDFFile(fileBuffer!) break + case 'epub': + extractedText = await this.processEPUBFile(fileBuffer!) + break case 'text': default: extractedText = await this.processTextFile(fileBuffer!) diff --git a/admin/app/utils/fs.ts b/admin/app/utils/fs.ts index 7cc3ba8..ecba222 100644 --- a/admin/app/utils/fs.ts +++ b/admin/app/utils/fs.ts @@ -152,7 +152,7 @@ export function matchesDevice(fsPath: string, deviceName: string): boolean { return false } -export function determineFileType(filename: string): 'image' | 'pdf' | 'text' | 'zim' | 'unknown' { +export function determineFileType(filename: string): 'image' | 'pdf' | 'text' | 'epub' | 'zim' | 'unknown' { const ext = path.extname(filename).toLowerCase() if (['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'].includes(ext)) { return 'image' @@ -160,6 +160,8 @@ export function determineFileType(filename: string): 'image' | 'pdf' | 'text' | return 'pdf' } else if (['.txt', '.md', '.docx', '.rtf'].includes(ext)) { return 'text' + } else if (ext === '.epub') { + return 'epub' } else if (ext === '.zim') { return 'zim' } else { diff --git a/admin/package.json b/admin/package.json index 5ad1a60..2fd46c7 100644 --- a/admin/package.json +++ b/admin/package.json @@ -121,7 +121,8 @@ "tar": "^7.5.10", "tesseract.js": "^7.0.0", "url-join": "^5.0.0", - "yaml": "^2.8.0" + "yaml": "^2.8.0", + "jszip": "^3.10.1" }, "hotHook": { "boundaries": [