mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-04-04 15:56:16 +02:00
feat(rag): add EPUB file support for Knowledge Base uploads
EPUBs are ZIP archives containing structured XHTML content with semantic chapter/section markup, making them well-suited for RAG text extraction and chunking.

Changes:
- Add 'epub' to determineFileType() in utils/fs.ts
- Add processEPUBFile() method in rag_service.ts that:
  - Reads container.xml to locate the OPF manifest
  - Parses the OPF spine for correct reading order
  - Extracts text from each XHTML content document using cheerio
  - Falls back to all manifest items if no spine is found
- Wire epub case into processAndEmbedFile() switch
- Add jszip dependency for ZIP archive reading (cheerio already present)

Closes #253-adjacent (EPUB is a common format for Project Gutenberg content and technical reference books)
This commit is contained in:
parent
db22b0c5f6
commit
dc7abfd41a
|
|
@ -8,6 +8,8 @@ import { deleteFileIfExists, determineFileType, getFile, getFileStatsIfExists, l
|
||||||
import { PDFParse } from 'pdf-parse'
|
import { PDFParse } from 'pdf-parse'
|
||||||
import { createWorker } from 'tesseract.js'
|
import { createWorker } from 'tesseract.js'
|
||||||
import { fromBuffer } from 'pdf2pic'
|
import { fromBuffer } from 'pdf2pic'
|
||||||
|
import JSZip from 'jszip'
|
||||||
|
import * as cheerio from 'cheerio'
|
||||||
import { OllamaService } from './ollama_service.js'
|
import { OllamaService } from './ollama_service.js'
|
||||||
import { SERVICE_NAMES } from '../../constants/service_names.js'
|
import { SERVICE_NAMES } from '../../constants/service_names.js'
|
||||||
import { removeStopwords } from 'stopword'
|
import { removeStopwords } from 'stopword'
|
||||||
|
|
@ -564,6 +566,86 @@ export class RagService {
|
||||||
return await this.extractTXTText(fileBuffer)
|
return await this.extractTXTText(fileBuffer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Extract text content from an EPUB file.
 *
 * EPUBs are ZIP archives containing XHTML content files. This reads
 * META-INF/container.xml to locate the OPF package document, uses the OPF
 * spine to determine reading order (falling back to every content document
 * in the manifest when the spine yields nothing), and concatenates the body
 * text of each content document in sequence.
 *
 * @param fileBuffer - Raw bytes of the .epub archive.
 * @returns The text of all content documents, separated by blank lines.
 *   Empty string when no content document yields any text.
 * @throws Error when container.xml is missing, declares no rootfile, or the
 *   OPF file it points at is not present in the archive.
 */
private async processEPUBFile(fileBuffer: Buffer): Promise<string> {
  const zip = await JSZip.loadAsync(fileBuffer)

  // Read container.xml to find the OPF file path
  const containerXml = await zip.file('META-INF/container.xml')?.async('text')
  if (!containerXml) {
    throw new Error('Invalid EPUB: missing META-INF/container.xml')
  }

  // Parse container.xml to get the OPF rootfile path
  const $container = cheerio.load(containerXml, { xml: true })
  const opfPath = $container('rootfile').attr('full-path')
  if (!opfPath) {
    throw new Error('Invalid EPUB: no rootfile found in container.xml')
  }

  // Manifest hrefs are relative to the directory that holds the OPF file
  const opfDir = opfPath.includes('/') ? opfPath.substring(0, opfPath.lastIndexOf('/') + 1) : ''

  // Read and parse the OPF file
  const opfContent = await zip.file(opfPath)?.async('text')
  if (!opfContent) {
    throw new Error(`Invalid EPUB: OPF file not found at ${opfPath}`)
  }
  const $opf = cheerio.load(opfContent, { xml: true })

  // XML-based manifest entries that are navigation/overlay metadata rather
  // than readable content; they must not leak into the no-spine fallback.
  const nonContentXmlTypes = new Set(['application/x-dtbncx+xml', 'application/smil+xml'])

  // Build a map of manifest items (id -> href), content documents only
  const manifestItems = new Map<string, string>()
  $opf('manifest item').each((_, el) => {
    const id = $opf(el).attr('id')
    const href = $opf(el).attr('href')
    const mediaType = ($opf(el).attr('media-type') ?? '').trim().toLowerCase()
    const isContentDocument =
      (mediaType.includes('html') || mediaType.includes('xml')) &&
      !nonContentXmlTypes.has(mediaType)
    if (id && href && isContentDocument) {
      manifestItems.set(id, href)
    }
  })

  // Get the reading order from the spine
  const spineOrder: string[] = []
  $opf('spine itemref').each((_, el) => {
    const idref = $opf(el).attr('idref')
    const href = idref ? manifestItems.get(idref) : undefined
    if (href) {
      spineOrder.push(href)
    }
  })

  // If no spine found, fall back to all manifest items
  const contentFiles = spineOrder.length > 0 ? spineOrder : Array.from(manifestItems.values())

  // Manifest hrefs are URL references while ZIP entry names are plain paths:
  // drop any fragment/query, percent-decode, and collapse "." / ".." segments
  // so that e.g. "Chapter%201.xhtml" or "../text/ch1.xhtml" resolve correctly.
  const resolveZipPath = (href: string): string => {
    const withoutFragment = href.split(/[?#]/, 1)[0] ?? href
    let decoded = withoutFragment
    try {
      decoded = decodeURIComponent(withoutFragment)
    } catch {
      // Malformed percent-escape: keep the href as written
    }
    const segments: string[] = []
    for (const segment of (opfDir + decoded).split('/')) {
      if (segment === '' || segment === '.') continue
      if (segment === '..') {
        segments.pop()
        continue
      }
      segments.push(segment)
    }
    return segments.join('/')
  }

  // Extract text from each content file in order
  const textParts: string[] = []
  for (const href of contentFiles) {
    // Try the literal path first so archives that already worked keep
    // resolving exactly as before, then the normalized form.
    const entry = zip.file(opfDir + href) ?? zip.file(resolveZipPath(href))
    if (!entry) {
      logger.debug(`[RAG] EPUB content document not found in archive, skipping: ${href}`)
      continue
    }

    const content = await entry.async('text')
    if (!content) {
      continue
    }

    // EPUB content documents are XHTML. A strict HTML5 parse treats
    // self-closing <script/> or <title/> as unclosed and swallows the rest of
    // the document, so parse leniently with self-closing tags recognised
    // while still decoding HTML entities such as &nbsp;.
    const $ = cheerio.load(content, { xml: { xmlMode: false, recognizeSelfClosing: true } })

    // Remove script and style elements
    $('script, style').remove()

    // This parser does not synthesise a <body>, so fall back to the whole
    // document when none is present.
    const body = $('body')
    const text = (body.length > 0 ? body.text() : $.root().text()).trim()
    if (text) {
      textParts.push(text)
    }
  }

  const fullText = textParts.join('\n\n')
  logger.debug(`[RAG] EPUB extracted ${textParts.length} chapters, ${fullText.length} characters total`)
  return fullText
}
|
||||||
|
|
||||||
private async embedTextAndCleanup(
|
private async embedTextAndCleanup(
|
||||||
extractedText: string,
|
extractedText: string,
|
||||||
filepath: string,
|
filepath: string,
|
||||||
|
|
@ -638,6 +720,9 @@ export class RagService {
|
||||||
case 'pdf':
|
case 'pdf':
|
||||||
extractedText = await this.processPDFFile(fileBuffer!)
|
extractedText = await this.processPDFFile(fileBuffer!)
|
||||||
break
|
break
|
||||||
|
case 'epub':
|
||||||
|
extractedText = await this.processEPUBFile(fileBuffer!)
|
||||||
|
break
|
||||||
case 'text':
|
case 'text':
|
||||||
default:
|
default:
|
||||||
extractedText = await this.processTextFile(fileBuffer!)
|
extractedText = await this.processTextFile(fileBuffer!)
|
||||||
|
|
|
||||||
|
|
@ -152,7 +152,7 @@ export function matchesDevice(fsPath: string, deviceName: string): boolean {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
export function determineFileType(filename: string): 'image' | 'pdf' | 'text' | 'zim' | 'unknown' {
|
export function determineFileType(filename: string): 'image' | 'pdf' | 'text' | 'epub' | 'zim' | 'unknown' {
|
||||||
const ext = path.extname(filename).toLowerCase()
|
const ext = path.extname(filename).toLowerCase()
|
||||||
if (['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'].includes(ext)) {
|
if (['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'].includes(ext)) {
|
||||||
return 'image'
|
return 'image'
|
||||||
|
|
@ -160,6 +160,8 @@ export function determineFileType(filename: string): 'image' | 'pdf' | 'text' |
|
||||||
return 'pdf'
|
return 'pdf'
|
||||||
} else if (['.txt', '.md', '.docx', '.rtf'].includes(ext)) {
|
} else if (['.txt', '.md', '.docx', '.rtf'].includes(ext)) {
|
||||||
return 'text'
|
return 'text'
|
||||||
|
} else if (ext === '.epub') {
|
||||||
|
return 'epub'
|
||||||
} else if (ext === '.zim') {
|
} else if (ext === '.zim') {
|
||||||
return 'zim'
|
return 'zim'
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
|
|
@ -121,7 +121,8 @@
|
||||||
"tar": "^7.5.10",
|
"tar": "^7.5.10",
|
||||||
"tesseract.js": "^7.0.0",
|
"tesseract.js": "^7.0.0",
|
||||||
"url-join": "^5.0.0",
|
"url-join": "^5.0.0",
|
||||||
"yaml": "^2.8.0"
|
"yaml": "^2.8.0",
|
||||||
|
"jszip": "^3.10.1"
|
||||||
},
|
},
|
||||||
"hotHook": {
|
"hotHook": {
|
||||||
"boundaries": [
|
"boundaries": [
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user