mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-03-28 03:29:25 +01:00
feat: zim content embedding
This commit is contained in:
parent
c2b6e079af
commit
8726700a0a
|
|
@ -5,7 +5,6 @@ import type { HttpContext } from '@adonisjs/core/http'
|
|||
import app from '@adonisjs/core/services/app'
|
||||
import { randomBytes } from 'node:crypto'
|
||||
import { sanitizeFilename } from '../utils/fs.js'
|
||||
import { stat } from 'node:fs/promises'
|
||||
import { getJobStatusSchema } from '#validators/rag'
|
||||
|
||||
@inject()
|
||||
|
|
@ -28,20 +27,10 @@ export default class RagController {
|
|||
name: fileName,
|
||||
})
|
||||
|
||||
// Get file size for tracking
|
||||
let fileSize: number | undefined = undefined
|
||||
try {
|
||||
const stats = await stat(fullPath)
|
||||
fileSize = stats.size
|
||||
} catch (error) {
|
||||
// Not critical if we can't get file size, just swallow the error
|
||||
}
|
||||
|
||||
// Dispatch background job for embedding
|
||||
const result = await EmbedFileJob.dispatch({
|
||||
filePath: fullPath,
|
||||
fileName,
|
||||
fileSize,
|
||||
})
|
||||
|
||||
return response.status(202).json({
|
||||
|
|
|
|||
|
|
@ -10,6 +10,10 @@ export interface EmbedFileJobParams {
|
|||
filePath: string
|
||||
fileName: string
|
||||
fileSize?: number
|
||||
// Batch processing for large ZIM files
|
||||
batchOffset?: number // Current batch offset (for ZIM files)
|
||||
totalArticles?: number // Total articles in ZIM (for progress tracking)
|
||||
isFinalBatch?: boolean // Whether this is the last batch (prevents premature deletion)
|
||||
}
|
||||
|
||||
export class EmbedFileJob {
|
||||
|
|
@ -26,9 +30,11 @@ export class EmbedFileJob {
|
|||
}
|
||||
|
||||
async handle(job: Job) {
|
||||
const { filePath, fileName } = job.data as EmbedFileJobParams
|
||||
const { filePath, fileName, batchOffset, totalArticles } = job.data as EmbedFileJobParams
|
||||
|
||||
logger.info(`[EmbedFileJob] Starting embedding process for: ${fileName}`)
|
||||
const isZimBatch = batchOffset !== undefined
|
||||
const batchInfo = isZimBatch ? ` (batch offset: ${batchOffset})` : ''
|
||||
logger.info(`[EmbedFileJob] Starting embedding process for: ${fileName}${batchInfo}`)
|
||||
|
||||
const dockerService = new DockerService()
|
||||
const ollamaService = new OllamaService()
|
||||
|
|
@ -55,30 +61,78 @@ export class EmbedFileJob {
|
|||
await job.updateData({
|
||||
...job.data,
|
||||
status: 'processing',
|
||||
startedAt: Date.now(),
|
||||
startedAt: job.data.startedAt || Date.now(),
|
||||
})
|
||||
|
||||
logger.info(`[EmbedFileJob] Processing file: ${filePath}`)
|
||||
|
||||
// Process and embed the file
|
||||
const result = await ragService.processAndEmbedFile(filePath)
|
||||
// Only allow deletion if explicitly marked as final batch
|
||||
const allowDeletion = job.data.isFinalBatch === true
|
||||
const result = await ragService.processAndEmbedFile(
|
||||
filePath,
|
||||
allowDeletion,
|
||||
batchOffset
|
||||
)
|
||||
|
||||
if (!result.success) {
|
||||
logger.error(`[EmbedFileJob] Failed to process file ${fileName}: ${result.message}`)
|
||||
throw new Error(result.message)
|
||||
}
|
||||
|
||||
// Update progress complete
|
||||
// For ZIM files with batching, check if more batches are needed
|
||||
if (result.hasMoreBatches) {
|
||||
const nextOffset = (batchOffset || 0) + (result.articlesProcessed || 0)
|
||||
logger.info(
|
||||
`[EmbedFileJob] Batch complete. Dispatching next batch at offset ${nextOffset}`
|
||||
)
|
||||
|
||||
// Dispatch next batch (not final yet)
|
||||
await EmbedFileJob.dispatch({
|
||||
filePath,
|
||||
fileName,
|
||||
batchOffset: nextOffset,
|
||||
totalArticles: totalArticles || result.totalArticles,
|
||||
isFinalBatch: false, // Explicitly not final
|
||||
})
|
||||
|
||||
// Calculate progress based on articles processed
|
||||
const progress = totalArticles
|
||||
? Math.round((nextOffset / totalArticles) * 100)
|
||||
: 50
|
||||
|
||||
await job.updateProgress(progress)
|
||||
await job.updateData({
|
||||
...job.data,
|
||||
status: 'batch_completed',
|
||||
lastBatchAt: Date.now(),
|
||||
chunks: (job.data.chunks || 0) + (result.chunks || 0),
|
||||
})
|
||||
|
||||
return {
|
||||
success: true,
|
||||
fileName,
|
||||
filePath,
|
||||
chunks: result.chunks,
|
||||
hasMoreBatches: true,
|
||||
nextOffset,
|
||||
message: `Batch embedded ${result.chunks} chunks, next batch queued`,
|
||||
}
|
||||
}
|
||||
|
||||
// Final batch or non-batched file - mark as complete
|
||||
const totalChunks = (job.data.chunks || 0) + (result.chunks || 0)
|
||||
await job.updateProgress(100)
|
||||
await job.updateData({
|
||||
...job.data,
|
||||
status: 'completed',
|
||||
completedAt: Date.now(),
|
||||
chunks: result.chunks,
|
||||
chunks: totalChunks,
|
||||
})
|
||||
|
||||
const batchMsg = isZimBatch ? ` (final batch, total chunks: ${totalChunks})` : ''
|
||||
logger.info(
|
||||
`[EmbedFileJob] Successfully embedded ${result.chunks} chunks from file: ${fileName}`
|
||||
`[EmbedFileJob] Successfully embedded ${result.chunks} chunks from file: ${fileName}${batchMsg}`
|
||||
)
|
||||
|
||||
return {
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import { createHash } from 'crypto'
|
|||
import { DockerService } from '#services/docker_service'
|
||||
import { ZimService } from '#services/zim_service'
|
||||
import { MapService } from '#services/map_service'
|
||||
import { EmbedFileJob } from './embed_file_job.js'
|
||||
|
||||
export class RunDownloadJob {
|
||||
static get queue() {
|
||||
|
|
@ -24,17 +25,6 @@ export class RunDownloadJob {
|
|||
const { url, filepath, timeout, allowedMimeTypes, forceNew, filetype } =
|
||||
job.data as RunDownloadJobParams
|
||||
|
||||
// console.log("Simulating delay for job for URL:", url)
|
||||
// await new Promise((resolve) => setTimeout(resolve, 30000)) // Simulate initial delay
|
||||
// console.log("Starting download for URL:", url)
|
||||
|
||||
// // simulate progress updates for demonstration
|
||||
// for (let progress = 0; progress <= 100; progress += 10) {
|
||||
// await new Promise((resolve) => setTimeout(resolve, 20000)) // Simulate time taken for each progress step
|
||||
// job.updateProgress(progress)
|
||||
// console.log(`Job progress for URL ${url}: ${progress}%`)
|
||||
// }
|
||||
|
||||
await doResumableDownload({
|
||||
url,
|
||||
filepath,
|
||||
|
|
@ -51,6 +41,16 @@ export class RunDownloadJob {
|
|||
const dockerService = new DockerService()
|
||||
const zimService = new ZimService(dockerService)
|
||||
await zimService.downloadRemoteSuccessCallback([url], true)
|
||||
|
||||
// Dispatch an embedding job for the downloaded ZIM file
|
||||
try {
|
||||
await EmbedFileJob.dispatch({
|
||||
fileName: url.split('/').pop() || '',
|
||||
filePath: filepath,
|
||||
})
|
||||
} catch (error) {
|
||||
console.error(`[RunDownloadJob] Error dispatching EmbedFileJob for URL ${url}:`, error)
|
||||
}
|
||||
} else if (filetype === 'map') {
|
||||
const mapsService = new MapService()
|
||||
await mapsService.downloadRemoteSuccessCallback([url], false)
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ import { streamToString } from '../../util/docs.js'
|
|||
import { getFile, getFileStatsIfExists, listDirectoryContentsRecursive } from '../utils/fs.js'
|
||||
import path from 'path'
|
||||
import InternalServerErrorException from '#exceptions/internal_server_error_exception'
|
||||
import logger from '@adonisjs/core/services/logger'
|
||||
|
||||
export class DocsService {
|
||||
private docsPath = path.join(process.cwd(), 'docs')
|
||||
|
|
@ -46,13 +47,13 @@ export class DocsService {
|
|||
// Filter out attribute-undefined errors which may be caused by emojis and special characters
|
||||
const criticalErrors = errors.filter((e) => e.error.id !== 'attribute-undefined')
|
||||
if (criticalErrors.length > 0) {
|
||||
console.error('Markdoc validation errors:', errors.map((e) => JSON.stringify(e.error)).join(', '))
|
||||
logger.error('Markdoc validation errors:', errors.map((e) => JSON.stringify(e.error)).join(', '))
|
||||
throw new Error('Markdoc validation failed')
|
||||
}
|
||||
|
||||
return Markdoc.transform(ast, config)
|
||||
} catch (error) {
|
||||
console.log('Error parsing Markdoc content:', error)
|
||||
logger.error('Error parsing Markdoc content:', error)
|
||||
throw new InternalServerErrorException(`Error parsing content: ${(error as Error).message}`)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,6 +15,8 @@ import { randomUUID } from 'node:crypto'
|
|||
import { join } from 'node:path'
|
||||
import KVStore from '#models/kv_store'
|
||||
import { parseBoolean } from '../utils/misc.js'
|
||||
import { ZIMExtractionService } from './zim_extraction_service.js'
|
||||
import { ZIM_BATCH_SIZE } from '../../constants/zim_extraction.js'
|
||||
|
||||
@inject()
|
||||
export class RagService {
|
||||
|
|
@ -38,6 +40,67 @@ export class RagService {
|
|||
private ollamaService: OllamaService
|
||||
) { }
|
||||
|
||||
private async _initializeQdrantClient() {
|
||||
if (!this.qdrantInitPromise) {
|
||||
this.qdrantInitPromise = (async () => {
|
||||
const qdrantUrl = await this.dockerService.getServiceURL(SERVICE_NAMES.QDRANT)
|
||||
if (!qdrantUrl) {
|
||||
throw new Error('Qdrant service is not installed or running.')
|
||||
}
|
||||
this.qdrant = new QdrantClient({ url: qdrantUrl })
|
||||
})()
|
||||
}
|
||||
return this.qdrantInitPromise
|
||||
}
|
||||
|
||||
private async _ensureDependencies() {
|
||||
if (!this.qdrant) {
|
||||
await this._initializeQdrantClient()
|
||||
}
|
||||
}
|
||||
|
||||
private async _ensureCollection(
|
||||
collectionName: string,
|
||||
dimensions: number = RagService.EMBEDDING_DIMENSION
|
||||
) {
|
||||
try {
|
||||
await this._ensureDependencies()
|
||||
const collections = await this.qdrant!.getCollections()
|
||||
const collectionExists = collections.collections.some((col) => col.name === collectionName)
|
||||
|
||||
if (!collectionExists) {
|
||||
await this.qdrant!.createCollection(collectionName, {
|
||||
vectors: {
|
||||
size: dimensions,
|
||||
distance: 'Cosine',
|
||||
},
|
||||
})
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error('Error ensuring Qdrant collection:', error)
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sanitizes text to ensure it's safe for JSON encoding and Qdrant storage.
|
||||
* Removes problematic characters that can cause "unexpected end of hex escape" errors:
|
||||
* - Null bytes (\x00)
|
||||
* - Invalid Unicode sequences
|
||||
* - Control characters (except newlines, tabs, and carriage returns)
|
||||
*/
|
||||
private sanitizeText(text: string): string {
|
||||
return text
|
||||
// Null bytes
|
||||
.replace(/\x00/g, '')
|
||||
// Problematic control characters (keep \n, \r, \t)
|
||||
.replace(/[\x01-\x08\x0B-\x0C\x0E-\x1F\x7F]/g, '')
|
||||
// Invalid Unicode surrogates
|
||||
.replace(/[\uD800-\uDFFF]/g, '')
|
||||
// Trim extra whitespace
|
||||
.trim()
|
||||
}
|
||||
|
||||
/**
|
||||
* Estimates token count for text. This is a conservative approximation:
|
||||
* - English text: ~1 token per 3 characters
|
||||
|
|
@ -114,48 +177,6 @@ export class RagService {
|
|||
return [...new Set(keywords)]
|
||||
}
|
||||
|
||||
private async _initializeQdrantClient() {
|
||||
if (!this.qdrantInitPromise) {
|
||||
this.qdrantInitPromise = (async () => {
|
||||
const qdrantUrl = await this.dockerService.getServiceURL(SERVICE_NAMES.QDRANT)
|
||||
if (!qdrantUrl) {
|
||||
throw new Error('Qdrant service is not installed or running.')
|
||||
}
|
||||
this.qdrant = new QdrantClient({ url: qdrantUrl })
|
||||
})()
|
||||
}
|
||||
return this.qdrantInitPromise
|
||||
}
|
||||
|
||||
private async _ensureDependencies() {
|
||||
if (!this.qdrant) {
|
||||
await this._initializeQdrantClient()
|
||||
}
|
||||
}
|
||||
|
||||
private async _ensureCollection(
|
||||
collectionName: string,
|
||||
dimensions: number = RagService.EMBEDDING_DIMENSION
|
||||
) {
|
||||
try {
|
||||
await this._ensureDependencies()
|
||||
const collections = await this.qdrant!.getCollections()
|
||||
const collectionExists = collections.collections.some((col) => col.name === collectionName)
|
||||
|
||||
if (!collectionExists) {
|
||||
await this.qdrant!.createCollection(collectionName, {
|
||||
vectors: {
|
||||
size: dimensions,
|
||||
distance: 'Cosine',
|
||||
},
|
||||
})
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error('Error ensuring Qdrant collection:', error)
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
public async embedAndStoreText(
|
||||
text: string,
|
||||
metadata: Record<string, any> = {}
|
||||
|
|
@ -237,21 +258,45 @@ export class RagService {
|
|||
|
||||
const timestamp = Date.now()
|
||||
const points = chunks.map((chunkText, index) => {
|
||||
// Extract keywords for hybrid search
|
||||
const keywords = this.extractKeywords(chunkText)
|
||||
logger.debug(`[RAG] Extracted keywords for chunk ${index}: [${keywords.join(', ')}]`)
|
||||
// Sanitize text to prevent JSON encoding errors
|
||||
const sanitizedText = this.sanitizeText(chunkText)
|
||||
|
||||
// Extract keywords from content
|
||||
const contentKeywords = this.extractKeywords(sanitizedText)
|
||||
|
||||
// For ZIM content, also extract keywords from structural metadata
|
||||
let structuralKeywords: string[] = []
|
||||
if (metadata.full_title) {
|
||||
structuralKeywords = this.extractKeywords(metadata.full_title as string)
|
||||
} else if (metadata.article_title) {
|
||||
structuralKeywords = this.extractKeywords(metadata.article_title as string)
|
||||
}
|
||||
|
||||
// Combine and dedup keywords
|
||||
const allKeywords = [...new Set([...structuralKeywords, ...contentKeywords])]
|
||||
|
||||
logger.debug(`[RAG] Extracted keywords for chunk ${index}: [${allKeywords.join(', ')}]`)
|
||||
if (structuralKeywords.length > 0) {
|
||||
logger.debug(`[RAG] - Structural: [${structuralKeywords.join(', ')}], Content: [${contentKeywords.join(', ')}]`)
|
||||
}
|
||||
|
||||
// Sanitize source metadata as well
|
||||
const sanitizedSource = typeof metadata.source === 'string'
|
||||
? this.sanitizeText(metadata.source)
|
||||
: 'unknown'
|
||||
|
||||
return {
|
||||
id: randomUUID(), // qdrant requires either uuid or unsigned int
|
||||
vector: embeddings[index],
|
||||
payload: {
|
||||
...metadata,
|
||||
text: chunkText,
|
||||
text: sanitizedText,
|
||||
chunk_index: index,
|
||||
total_chunks: chunks.length,
|
||||
keywords: keywords.join(' '), // Store as space-separated string for text search
|
||||
char_count: chunkText.length,
|
||||
keywords: allKeywords.join(' '), // store as space-separated string for text search
|
||||
char_count: sanitizedText.length,
|
||||
created_at: timestamp,
|
||||
source: metadata.source || 'unknown'
|
||||
source: sanitizedSource
|
||||
},
|
||||
}
|
||||
})
|
||||
|
|
@ -269,12 +314,6 @@ export class RagService {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Preprocess an image to enhance text extraction quality.
|
||||
* Normalizes, grayscales, sharpens, and resizes the image to a manageable size.
|
||||
* @param filebuffer Buffer of the image file
|
||||
* @returns - Processed image buffer
|
||||
*/
|
||||
private async preprocessImage(filebuffer: Buffer): Promise<Buffer> {
|
||||
return await sharp(filebuffer)
|
||||
.grayscale()
|
||||
|
|
@ -284,12 +323,6 @@ export class RagService {
|
|||
.toBuffer()
|
||||
}
|
||||
|
||||
/**
|
||||
* If the original PDF has little to no extractable text,
|
||||
* we can use this method to convert each page to an image for OCR processing.
|
||||
* @param filebuffer - Buffer of the PDF file
|
||||
* @returns - Array of image buffers, one per page
|
||||
*/
|
||||
private async convertPDFtoImages(filebuffer: Buffer): Promise<Buffer[]> {
|
||||
const converted = await fromBuffer(filebuffer, {
|
||||
quality: 50,
|
||||
|
|
@ -301,11 +334,6 @@ export class RagService {
|
|||
return converted.filter((res) => res.buffer).map((res) => res.buffer!)
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract text from a PDF file using pdf-parse.
|
||||
* @param filebuffer - Buffer of the PDF file
|
||||
* @returns - Extracted text
|
||||
*/
|
||||
private async extractPDFText(filebuffer: Buffer): Promise<string> {
|
||||
const parser = new PDFParse({ data: filebuffer })
|
||||
const data = await parser.getText()
|
||||
|
|
@ -313,20 +341,10 @@ export class RagService {
|
|||
return data.text
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract text from a plain text file.
|
||||
* @param filebuffer - Buffer of the text file
|
||||
* @returns - Extracted text
|
||||
*/
|
||||
private async extractTXTText(filebuffer: Buffer): Promise<string> {
|
||||
return filebuffer.toString('utf-8')
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract text from an image file using Tesseract.js OCR.
|
||||
* @param filebuffer - Buffer of the image file
|
||||
* @returns - Extracted text
|
||||
*/
|
||||
private async extractImageText(filebuffer: Buffer): Promise<string> {
|
||||
const worker = await createWorker('eng')
|
||||
const result = await worker.recognize(filebuffer)
|
||||
|
|
@ -334,71 +352,229 @@ export class RagService {
|
|||
return result.data.text
|
||||
}
|
||||
|
||||
private async processImageFile(fileBuffer: Buffer): Promise<string> {
|
||||
const preprocessedBuffer = await this.preprocessImage(fileBuffer)
|
||||
return await this.extractImageText(preprocessedBuffer)
|
||||
}
|
||||
|
||||
/**
|
||||
* Will process the PDF and attempt to extract text.
|
||||
* If the extracted text is minimal, it will fallback to OCR on each page.
|
||||
*/
|
||||
private async processPDFFile(fileBuffer: Buffer): Promise<string> {
|
||||
let extractedText = await this.extractPDFText(fileBuffer)
|
||||
|
||||
// Check if there was no extracted text or it was very minimal
|
||||
if (!extractedText || extractedText.trim().length < 100) {
|
||||
logger.debug('[RAG] PDF text extraction minimal, attempting OCR on pages')
|
||||
// Convert PDF pages to images for OCR if text extraction was poor
|
||||
const imageBuffers = await this.convertPDFtoImages(fileBuffer)
|
||||
extractedText = ''
|
||||
|
||||
for (const imgBuffer of imageBuffers) {
|
||||
const preprocessedImg = await this.preprocessImage(imgBuffer)
|
||||
const pageText = await this.extractImageText(preprocessedImg)
|
||||
extractedText += pageText + '\n'
|
||||
}
|
||||
}
|
||||
|
||||
return extractedText
|
||||
}
|
||||
|
||||
/**
|
||||
* Process a ZIM file: extract content with metadata and embed each chunk.
|
||||
* Returns early with complete result since ZIM processing is self-contained.
|
||||
* Supports batch processing to prevent lock timeouts on large ZIM files.
|
||||
*/
|
||||
private async processZIMFile(
|
||||
filepath: string,
|
||||
deleteAfterEmbedding: boolean,
|
||||
batchOffset?: number
|
||||
): Promise<{
|
||||
success: boolean
|
||||
message: string
|
||||
chunks?: number
|
||||
hasMoreBatches?: boolean
|
||||
articlesProcessed?: number
|
||||
totalArticles?: number
|
||||
}> {
|
||||
const zimExtractionService = new ZIMExtractionService()
|
||||
|
||||
// Process in batches to avoid lock timeout
|
||||
const startOffset = batchOffset || 0
|
||||
|
||||
logger.info(
|
||||
`[RAG] Extracting ZIM content (batch: offset=${startOffset}, size=${ZIM_BATCH_SIZE})`
|
||||
)
|
||||
|
||||
const zimChunks = await zimExtractionService.extractZIMContent(filepath, {
|
||||
startOffset,
|
||||
batchSize: ZIM_BATCH_SIZE,
|
||||
})
|
||||
|
||||
logger.info(
|
||||
`[RAG] Extracted ${zimChunks.length} chunks from ZIM file with enhanced metadata`
|
||||
)
|
||||
|
||||
// Process each chunk individually with its metadata
|
||||
let totalChunks = 0
|
||||
for (const zimChunk of zimChunks) {
|
||||
const result = await this.embedAndStoreText(zimChunk.text, {
|
||||
source: filepath,
|
||||
content_type: 'zim_article',
|
||||
|
||||
// Article-level context
|
||||
article_title: zimChunk.articleTitle,
|
||||
article_path: zimChunk.articlePath,
|
||||
|
||||
// Section-level context
|
||||
section_title: zimChunk.sectionTitle,
|
||||
full_title: zimChunk.fullTitle,
|
||||
hierarchy: zimChunk.hierarchy,
|
||||
section_level: zimChunk.sectionLevel,
|
||||
|
||||
// Use the same document ID for all chunks from the same article for grouping in search results
|
||||
document_id: zimChunk.documentId,
|
||||
|
||||
// Archive metadata
|
||||
archive_title: zimChunk.archiveMetadata.title,
|
||||
archive_creator: zimChunk.archiveMetadata.creator,
|
||||
archive_publisher: zimChunk.archiveMetadata.publisher,
|
||||
archive_date: zimChunk.archiveMetadata.date,
|
||||
archive_language: zimChunk.archiveMetadata.language,
|
||||
archive_description: zimChunk.archiveMetadata.description,
|
||||
|
||||
// Extraction metadata - not overly relevant for search, but could be useful for debugging and future features...
|
||||
extraction_strategy: zimChunk.strategy,
|
||||
})
|
||||
|
||||
if (result) {
|
||||
totalChunks += result.chunks
|
||||
}
|
||||
}
|
||||
|
||||
// Count unique articles processed in this batch
|
||||
const articlesInBatch = new Set(zimChunks.map((c) => c.documentId)).size
|
||||
const hasMoreBatches = zimChunks.length === ZIM_BATCH_SIZE
|
||||
|
||||
logger.info(
|
||||
`[RAG] Successfully embedded ${totalChunks} total chunks from ${articlesInBatch} articles (hasMore: ${hasMoreBatches})`
|
||||
)
|
||||
|
||||
// Only delete the file when:
|
||||
// 1. deleteAfterEmbedding is true (caller wants deletion)
|
||||
// 2. No more batches remain (this is the final batch)
|
||||
// This prevents race conditions where early batches complete after later ones
|
||||
const shouldDelete = deleteAfterEmbedding && !hasMoreBatches
|
||||
if (shouldDelete) {
|
||||
logger.info(`[RAG] Final batch complete, deleting ZIM file: ${filepath}`)
|
||||
await deleteFileIfExists(filepath)
|
||||
} else if (!hasMoreBatches) {
|
||||
logger.info(`[RAG] Final batch complete, but file deletion was not requested`)
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
message: hasMoreBatches
|
||||
? 'ZIM batch processed successfully. More batches remain.'
|
||||
: 'ZIM file processed and embedded successfully with enhanced metadata.',
|
||||
chunks: totalChunks,
|
||||
hasMoreBatches,
|
||||
articlesProcessed: articlesInBatch,
|
||||
}
|
||||
}
|
||||
|
||||
private async processTextFile(fileBuffer: Buffer): Promise<string> {
|
||||
return await this.extractTXTText(fileBuffer)
|
||||
}
|
||||
|
||||
private async embedTextAndCleanup(
|
||||
extractedText: string,
|
||||
filepath: string,
|
||||
deleteAfterEmbedding: boolean = false
|
||||
): Promise<{ success: boolean; message: string; chunks?: number }> {
|
||||
if (!extractedText || extractedText.trim().length === 0) {
|
||||
return { success: false, message: 'Process completed succesfully, but no text was found to embed.' }
|
||||
}
|
||||
|
||||
const embedResult = await this.embedAndStoreText(extractedText, {
|
||||
source: filepath
|
||||
})
|
||||
|
||||
if (!embedResult) {
|
||||
return { success: false, message: 'Failed to embed and store the extracted text.' }
|
||||
}
|
||||
|
||||
if (deleteAfterEmbedding) {
|
||||
logger.info(`[RAG] Embedding complete, deleting uploaded file: ${filepath}`)
|
||||
await deleteFileIfExists(filepath)
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
message: 'File processed and embedded successfully.',
|
||||
chunks: embedResult.chunks,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Main pipeline to process and embed an uploaded file into the RAG knowledge base.
|
||||
* This includes text extraction, chunking, embedding, and storing in Qdrant.
|
||||
*
|
||||
* Orchestrates file type detection and delegates to specialized processors.
|
||||
* For ZIM files, supports batch processing via batchOffset parameter.
|
||||
*/
|
||||
public async processAndEmbedFile(
|
||||
filepath: string, // Should already be the full path to the uploaded file
|
||||
deleteAfterEmbedding: boolean = false
|
||||
): Promise<{ success: boolean; message: string; chunks?: number }> {
|
||||
filepath: string,
|
||||
deleteAfterEmbedding: boolean = false,
|
||||
batchOffset?: number
|
||||
): Promise<{
|
||||
success: boolean
|
||||
message: string
|
||||
chunks?: number
|
||||
hasMoreBatches?: boolean
|
||||
articlesProcessed?: number
|
||||
totalArticles?: number
|
||||
}> {
|
||||
try {
|
||||
const fileType = determineFileType(filepath)
|
||||
logger.debug(`[RAG] Processing file: ${filepath} (detected type: ${fileType})`)
|
||||
|
||||
if (fileType === 'unknown') {
|
||||
return { success: false, message: 'Unsupported file type.' }
|
||||
}
|
||||
|
||||
const origFileBuffer = await getFile(filepath, 'buffer')
|
||||
if (!origFileBuffer) {
|
||||
// Read file buffer (not needed for ZIM as it reads directly)
|
||||
const fileBuffer = fileType !== 'zim' ? await getFile(filepath, 'buffer') : null
|
||||
if (fileType !== 'zim' && !fileBuffer) {
|
||||
return { success: false, message: 'Failed to read the uploaded file.' }
|
||||
}
|
||||
|
||||
let extractedText = ''
|
||||
|
||||
if (fileType === 'image') {
|
||||
const preprocessedBuffer = await this.preprocessImage(origFileBuffer)
|
||||
extractedText = await this.extractImageText(preprocessedBuffer)
|
||||
} else if (fileType === 'pdf') {
|
||||
extractedText = await this.extractPDFText(origFileBuffer)
|
||||
// Check if there was no extracted text or it was very minimal
|
||||
if (!extractedText || extractedText.trim().length < 100) {
|
||||
// Convert PDF pages to images for OCR
|
||||
const imageBuffers = await this.convertPDFtoImages(origFileBuffer)
|
||||
for (const imgBuffer of imageBuffers) {
|
||||
const preprocessedImg = await this.preprocessImage(imgBuffer)
|
||||
const pageText = await this.extractImageText(preprocessedImg)
|
||||
extractedText += pageText + '\n'
|
||||
}
|
||||
}
|
||||
} else {
|
||||
extractedText = await this.extractTXTText(origFileBuffer)
|
||||
// Process based on file type
|
||||
// ZIM files are handled specially since they have their own embedding workflow
|
||||
if (fileType === 'zim') {
|
||||
return await this.processZIMFile(filepath, deleteAfterEmbedding, batchOffset)
|
||||
}
|
||||
|
||||
if (!extractedText || extractedText.trim().length === 0) {
|
||||
return { success: false, message: 'No text could be extracted from the file.' }
|
||||
// Extract text based on file type
|
||||
let extractedText: string
|
||||
switch (fileType) {
|
||||
case 'image':
|
||||
extractedText = await this.processImageFile(fileBuffer!)
|
||||
break
|
||||
case 'pdf':
|
||||
extractedText = await this.processPDFFile(fileBuffer!)
|
||||
break
|
||||
case 'text':
|
||||
default:
|
||||
extractedText = await this.processTextFile(fileBuffer!)
|
||||
break
|
||||
}
|
||||
|
||||
const embedResult = await this.embedAndStoreText(extractedText, {
|
||||
source: filepath
|
||||
})
|
||||
|
||||
if (!embedResult) {
|
||||
return { success: false, message: 'Failed to embed and store the extracted text.' }
|
||||
}
|
||||
|
||||
if (deleteAfterEmbedding) {
|
||||
// Cleanup the file from disk
|
||||
logger.info(`[RAG] Embedding complete, deleting uploaded file: ${filepath}`)
|
||||
await deleteFileIfExists(filepath)
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
message: 'File processed and embedded successfully.',
|
||||
chunks: embedResult?.chunks,
|
||||
}
|
||||
// Embed extracted text and cleanup
|
||||
return await this.embedTextAndCleanup(extractedText, filepath, deleteAfterEmbedding)
|
||||
} catch (error) {
|
||||
logger.error('Error processing and embedding file:', error)
|
||||
logger.error('[RAG] Error processing and embedding file:', error)
|
||||
return { success: false, message: 'Error processing and embedding file.' }
|
||||
}
|
||||
}
|
||||
|
|
@ -497,6 +673,13 @@ export class RagService {
|
|||
keywords: (result.payload?.keywords as string) || '',
|
||||
chunk_index: (result.payload?.chunk_index as number) || 0,
|
||||
created_at: (result.payload?.created_at as number) || 0,
|
||||
// Enhanced ZIM metadata (likely to be undefined for non-ZIM content)
|
||||
article_title: result.payload?.article_title as string | undefined,
|
||||
section_title: result.payload?.section_title as string | undefined,
|
||||
full_title: result.payload?.full_title as string | undefined,
|
||||
hierarchy: result.payload?.hierarchy as string | undefined,
|
||||
document_id: result.payload?.document_id as string | undefined,
|
||||
content_type: result.payload?.content_type as string | undefined,
|
||||
}))
|
||||
|
||||
const rerankedResults = this.rerankResults(resultsWithMetadata, keywords, query)
|
||||
|
|
@ -508,7 +691,7 @@ export class RagService {
|
|||
)
|
||||
})
|
||||
|
||||
// Return top N results
|
||||
// Return top N results with enhanced metadata
|
||||
return rerankedResults.slice(0, limit).map((result) => ({
|
||||
text: result.text,
|
||||
score: result.finalScore,
|
||||
|
|
@ -516,6 +699,13 @@ export class RagService {
|
|||
chunk_index: result.chunk_index,
|
||||
created_at: result.created_at,
|
||||
semantic_score: result.score,
|
||||
// Enhanced ZIM metadata (likely to be undefined for non-ZIM content)
|
||||
article_title: result.article_title,
|
||||
section_title: result.section_title,
|
||||
full_title: result.full_title,
|
||||
hierarchy: result.hierarchy,
|
||||
document_id: result.document_id,
|
||||
content_type: result.content_type,
|
||||
},
|
||||
}))
|
||||
} catch (error) {
|
||||
|
|
@ -544,6 +734,12 @@ export class RagService {
|
|||
keywords: string
|
||||
chunk_index: number
|
||||
created_at: number
|
||||
article_title?: string
|
||||
section_title?: string
|
||||
full_title?: string
|
||||
hierarchy?: string
|
||||
document_id?: string
|
||||
content_type?: string
|
||||
}>,
|
||||
queryKeywords: string[],
|
||||
originalQuery: string
|
||||
|
|
@ -553,6 +749,12 @@ export class RagService {
|
|||
finalScore: number
|
||||
chunk_index: number
|
||||
created_at: number
|
||||
article_title?: string
|
||||
section_title?: string
|
||||
full_title?: string
|
||||
hierarchy?: string
|
||||
document_id?: string
|
||||
content_type?: string
|
||||
}> {
|
||||
return results
|
||||
.map((result) => {
|
||||
|
|
@ -711,11 +913,9 @@ export class RagService {
|
|||
for (const fileInfo of filesToEmbed) {
|
||||
try {
|
||||
logger.info(`[RAG] Dispatching embed job for: ${fileInfo.source}`)
|
||||
const stats = await getFileStatsIfExists(fileInfo.path)
|
||||
await EmbedFileJob.dispatch({
|
||||
filePath: fileInfo.path,
|
||||
fileName: fileInfo.source,
|
||||
fileSize: stats?.size,
|
||||
})
|
||||
logger.info(`[RAG] Successfully dispatched job for ${fileInfo.source}`)
|
||||
} catch (fileError) {
|
||||
|
|
|
|||
310
admin/app/services/zim_extraction_service.ts
Normal file
310
admin/app/services/zim_extraction_service.ts
Normal file
|
|
@ -0,0 +1,310 @@
|
|||
import { Archive, Entry } from '@openzim/libzim'
|
||||
import * as cheerio from 'cheerio'
|
||||
import { HTML_SELECTORS_TO_REMOVE, NON_CONTENT_HEADING_PATTERNS } from '../../constants/zim_extraction.js'
|
||||
import logger from '@adonisjs/core/services/logger'
|
||||
import { ExtractZIMChunkingStrategy, ExtractZIMContentOptions, ZIMContentChunk, ZIMArchiveMetadata } from '../../types/zim.js'
|
||||
import { randomUUID } from 'node:crypto'
|
||||
import { access } from 'node:fs/promises'
|
||||
|
||||
export class ZIMExtractionService {
|
||||
|
||||
private extractArchiveMetadata(archive: Archive): ZIMArchiveMetadata {
|
||||
try {
|
||||
return {
|
||||
title: archive.getMetadata('Title') || archive.getMetadata('Name') || 'Unknown',
|
||||
creator: archive.getMetadata('Creator') || 'Unknown',
|
||||
publisher: archive.getMetadata('Publisher') || 'Unknown',
|
||||
date: archive.getMetadata('Date') || 'Unknown',
|
||||
language: archive.getMetadata('Language') || 'Unknown',
|
||||
description: archive.getMetadata('Description') || '',
|
||||
}
|
||||
} catch (error) {
|
||||
logger.warn('[ZIMExtractionService]: Could not extract all metadata, using defaults', error)
|
||||
return {
|
||||
title: 'Unknown',
|
||||
creator: 'Unknown',
|
||||
publisher: 'Unknown',
|
||||
date: 'Unknown',
|
||||
language: 'Unknown',
|
||||
description: '',
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Breaks out a ZIM file's entries into their structured content form
|
||||
* to facilitate better indexing and retrieval.
|
||||
* Returns enhanced chunks with full article context and metadata.
|
||||
*
|
||||
* @param filePath - Path to the ZIM file
|
||||
* @param opts - Options including maxArticles, strategy, onProgress, startOffset, and batchSize
|
||||
*/
|
||||
async extractZIMContent(filePath: string, opts: ExtractZIMContentOptions = {}): Promise<ZIMContentChunk[]> {
|
||||
try {
|
||||
logger.info(`[ZIMExtractionService]: Processing ZIM file at path: ${filePath}`)
|
||||
|
||||
// defensive - check if file still exists before opening
|
||||
// could have been deleted by another process or batch
|
||||
try {
|
||||
await access(filePath)
|
||||
} catch (error) {
|
||||
logger.error(`[ZIMExtractionService]: ZIM file not accessible: ${filePath}`)
|
||||
throw new Error(`ZIM file not found or not accessible: ${filePath}`)
|
||||
}
|
||||
|
||||
const archive = new Archive(filePath)
|
||||
|
||||
// Extract archive-level metadata once
|
||||
const archiveMetadata = this.extractArchiveMetadata(archive)
|
||||
logger.info(`[ZIMExtractionService]: Archive metadata - Title: ${archiveMetadata.title}, Language: ${archiveMetadata.language}`)
|
||||
|
||||
let articlesProcessed = 0
|
||||
let articlesSkipped = 0
|
||||
const processedPaths = new Set<string>()
|
||||
const toReturn: ZIMContentChunk[] = []
|
||||
|
||||
// Support batch processing to avoid lock timeouts on large ZIM files
|
||||
const startOffset = opts.startOffset || 0
|
||||
const batchSize = opts.batchSize || (opts.maxArticles || Infinity)
|
||||
|
||||
for (const entry of archive.iterByPath()) {
|
||||
// Skip articles until we reach the start offset
|
||||
if (articlesSkipped < startOffset) {
|
||||
if (this.isArticleEntry(entry) && !processedPaths.has(entry.path)) {
|
||||
articlesSkipped++
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if (articlesProcessed >= batchSize) {
|
||||
break
|
||||
}
|
||||
|
||||
if (!this.isArticleEntry(entry)) {
|
||||
logger.debug(`[ZIMExtractionService]: Skipping non-article entry at path: ${entry.path}`)
|
||||
continue
|
||||
}
|
||||
|
||||
if (processedPaths.has(entry.path)) {
|
||||
logger.debug(`[ZIMExtractionService]: Skipping duplicate entry at path: ${entry.path}`)
|
||||
continue
|
||||
}
|
||||
processedPaths.add(entry.path)
|
||||
|
||||
const item = entry.item
|
||||
const blob = item.data
|
||||
const html = this.getCleanedHTMLString(blob.data)
|
||||
|
||||
const strategy = opts.strategy || this.chooseChunkingStrategy(html);
|
||||
logger.debug(`[ZIMExtractionService]: Chosen chunking strategy for path ${entry.path}: ${strategy}`)
|
||||
|
||||
// Generate a unique document ID. All chunks from same article will share it
|
||||
const documentId = randomUUID()
|
||||
const articleTitle = entry.title || entry.path
|
||||
|
||||
let chunks: ZIMContentChunk[]
|
||||
|
||||
if (strategy === 'structured') {
|
||||
const structured = this.extractStructuredContent(html)
|
||||
chunks = structured.sections.map(s => ({
|
||||
text: s.text,
|
||||
articleTitle,
|
||||
articlePath: entry.path,
|
||||
sectionTitle: s.heading,
|
||||
fullTitle: `${articleTitle} - ${s.heading}`,
|
||||
hierarchy: `${articleTitle} > ${s.heading}`,
|
||||
sectionLevel: s.level,
|
||||
documentId,
|
||||
archiveMetadata,
|
||||
strategy,
|
||||
}))
|
||||
} else {
|
||||
// Simple strategy - entire article as one chunk
|
||||
const text = this.extractTextFromHTML(html) || ''
|
||||
chunks = [{
|
||||
text,
|
||||
articleTitle,
|
||||
articlePath: entry.path,
|
||||
sectionTitle: articleTitle, // Same as article for simple strategy
|
||||
fullTitle: articleTitle,
|
||||
hierarchy: articleTitle,
|
||||
documentId,
|
||||
archiveMetadata,
|
||||
strategy,
|
||||
}]
|
||||
}
|
||||
|
||||
logger.debug(`Extracted ${chunks.length} chunks from article at path: ${entry.path} using strategy: ${strategy}`)
|
||||
|
||||
const nonEmptyChunks = chunks.filter(c => c.text.trim().length > 0)
|
||||
logger.debug(`After filtering empty chunks, ${nonEmptyChunks.length} chunks remain for article at path: ${entry.path}`)
|
||||
toReturn.push(...nonEmptyChunks)
|
||||
articlesProcessed++
|
||||
|
||||
if (opts.onProgress) {
|
||||
opts.onProgress(articlesProcessed, archive.articleCount)
|
||||
}
|
||||
}
|
||||
|
||||
logger.info(`[ZIMExtractionService]: Completed processing ZIM file. Total articles processed: ${articlesProcessed}`)
|
||||
logger.debug("Final structured content sample:", toReturn.slice(0, 3).map(c => ({
|
||||
articleTitle: c.articleTitle,
|
||||
sectionTitle: c.sectionTitle,
|
||||
hierarchy: c.hierarchy,
|
||||
textPreview: c.text.substring(0, 100)
|
||||
})))
|
||||
logger.debug("Total structured sections extracted:", toReturn.length)
|
||||
return toReturn
|
||||
} catch (error) {
|
||||
logger.error('Error processing ZIM file:', error)
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
private chooseChunkingStrategy(html: string, options = {
|
||||
forceStrategy: null as ExtractZIMChunkingStrategy | null,
|
||||
}): ExtractZIMChunkingStrategy {
|
||||
const {
|
||||
forceStrategy = null,
|
||||
} = options;
|
||||
|
||||
if (forceStrategy) return forceStrategy;
|
||||
|
||||
// Use a simple analysis to determin if the HTML has any meaningful structure
|
||||
// that we can leverage for better chunking. If not, we'll just chunk it as one big piece of text.
|
||||
return this.hasStructuredHeadings(html) ? 'structured' : 'simple';
|
||||
}
|
||||
|
||||
private getCleanedHTMLString(buff: Buffer<ArrayBufferLike>): string {
|
||||
const rawString = buff.toString('utf-8');
|
||||
const $ = cheerio.load(rawString);
|
||||
|
||||
HTML_SELECTORS_TO_REMOVE.forEach((selector) => {
|
||||
$(selector).remove()
|
||||
});
|
||||
|
||||
return $.html();
|
||||
}
|
||||
|
||||
private extractTextFromHTML(html: string): string | null {
|
||||
try {
|
||||
const $ = cheerio.load(html)
|
||||
|
||||
// Search body first, then root if body is absent
|
||||
const text = $('body').length ? $('body').text() : $.root().text()
|
||||
|
||||
return text.replace(/\s+/g, ' ').replace(/\n\s*\n/g, '\n').trim()
|
||||
} catch (error) {
|
||||
logger.error('Error extracting text from HTML:', error)
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
private extractStructuredContent(html: string) {
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const title = $('h1').first().text().trim() || $('title').text().trim();
|
||||
|
||||
// Extract sections with their headings and heading levels
|
||||
const sections: Array<{ heading: string; text: string; level: number }> = [];
|
||||
let currentSection = { heading: 'Introduction', content: [] as string[], level: 2 };
|
||||
|
||||
$('body').children().each((_, element) => {
|
||||
const $el = $(element);
|
||||
const tagName = element.tagName?.toLowerCase();
|
||||
|
||||
if (['h2', 'h3', 'h4'].includes(tagName)) {
|
||||
// Save current section if it has content
|
||||
if (currentSection.content.length > 0) {
|
||||
sections.push({
|
||||
heading: currentSection.heading,
|
||||
text: currentSection.content.join(' ').replace(/\s+/g, ' ').trim(),
|
||||
level: currentSection.level,
|
||||
});
|
||||
}
|
||||
// Start new section
|
||||
const level = parseInt(tagName.substring(1)); // Extract number from h2, h3, h4
|
||||
currentSection = {
|
||||
heading: $el.text().replace(/\[edit\]/gi, '').trim(),
|
||||
content: [],
|
||||
level,
|
||||
};
|
||||
} else if (['p', 'ul', 'ol', 'dl', 'table'].includes(tagName)) {
|
||||
const text = $el.text().trim();
|
||||
if (text.length > 0) {
|
||||
currentSection.content.push(text);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Push the last section if it has content
|
||||
if (currentSection.content.length > 0) {
|
||||
sections.push({
|
||||
heading: currentSection.heading,
|
||||
text: currentSection.content.join(' ').replace(/\s+/g, ' ').trim(),
|
||||
level: currentSection.level,
|
||||
});
|
||||
}
|
||||
|
||||
return {
|
||||
title,
|
||||
sections,
|
||||
fullText: sections.map(s => `${s.heading}\n${s.text}`).join('\n\n'),
|
||||
};
|
||||
}
|
||||
|
||||
private hasStructuredHeadings(html: string): boolean {
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const headings = $('h2, h3').toArray();
|
||||
|
||||
// Consider it structured if it has at least 2 headings to break content into meaningful sections
|
||||
if (headings.length < 2) return false;
|
||||
|
||||
// Check that headings have substantial content between them
|
||||
let sectionsWithContent = 0;
|
||||
|
||||
for (const heading of headings) {
|
||||
const $heading = $(heading);
|
||||
const headingText = $heading.text().trim();
|
||||
|
||||
// Skip empty or very short headings, likely not meaningful
|
||||
if (headingText.length < 3) continue;
|
||||
|
||||
// Skip common non-content headings
|
||||
if (NON_CONTENT_HEADING_PATTERNS.some(pattern => pattern.test(headingText))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Content until next heading
|
||||
let contentLength = 0;
|
||||
let $next = $heading.next();
|
||||
|
||||
while ($next.length && !$next.is('h1, h2, h3, h4')) {
|
||||
contentLength += $next.text().trim().length;
|
||||
$next = $next.next();
|
||||
}
|
||||
|
||||
// Consider it a real section if it has at least 100 chars of content
|
||||
if (contentLength >= 100) {
|
||||
sectionsWithContent++;
|
||||
}
|
||||
}
|
||||
|
||||
// Require at least 2 sections with substantial content
|
||||
return sectionsWithContent >= 2;
|
||||
}
|
||||
|
||||
private isArticleEntry(entry: Entry): boolean {
|
||||
try {
|
||||
if (entry.isRedirect) return false;
|
||||
|
||||
const item = entry.item;
|
||||
const mimeType = item.mimetype;
|
||||
|
||||
return mimeType === 'text/html' || mimeType === 'application/xhtml+xml';
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -43,7 +43,7 @@ interface IZimService {
|
|||
|
||||
@inject()
|
||||
export class ZimService implements IZimService {
|
||||
constructor(private dockerService: DockerService) {}
|
||||
constructor(private dockerService: DockerService) { }
|
||||
|
||||
async list() {
|
||||
const dirPath = join(process.cwd(), ZIM_STORAGE_PATH)
|
||||
|
|
@ -264,7 +264,7 @@ export class ZimService implements IZimService {
|
|||
}
|
||||
|
||||
return downloadFilenames.length > 0 ? downloadFilenames : null
|
||||
}
|
||||
}
|
||||
|
||||
async downloadRemoteSuccessCallback(urls: string[], restart = true) {
|
||||
// Check if any URL is a Wikipedia download and handle it
|
||||
|
|
@ -275,28 +275,28 @@ export class ZimService implements IZimService {
|
|||
}
|
||||
|
||||
if (restart) {
|
||||
// Check if there are any remaining ZIM download jobs before restarting
|
||||
// Check if there are any remaining ZIM download jobs before restarting
|
||||
const { QueueService } = await import('./queue_service.js')
|
||||
const queueService = new QueueService()
|
||||
const queue = queueService.getQueue('downloads')
|
||||
|
||||
|
||||
// Get all active and waiting jobs
|
||||
const [activeJobs, waitingJobs] = await Promise.all([
|
||||
queue.getActive(),
|
||||
queue.getWaiting(),
|
||||
])
|
||||
|
||||
|
||||
// Filter out completed jobs (progress === 100) to avoid race condition
|
||||
// where this job itself is still in the active queue
|
||||
const activeIncompleteJobs = activeJobs.filter((job) => {
|
||||
const progress = typeof job.progress === 'number' ? job.progress : 0
|
||||
return progress < 100
|
||||
})
|
||||
|
||||
|
||||
// Check if any remaining incomplete jobs are ZIM downloads
|
||||
const allJobs = [...activeIncompleteJobs, ...waitingJobs]
|
||||
const hasRemainingZimJobs = allJobs.some((job) => job.data.filetype === 'zim')
|
||||
|
||||
|
||||
if (hasRemainingZimJobs) {
|
||||
logger.info('[ZimService] Skipping container restart - more ZIM downloads pending')
|
||||
} else {
|
||||
|
|
@ -364,7 +364,7 @@ export class ZimService implements IZimService {
|
|||
// Check each tier from highest to lowest (assuming tiers are ordered from low to high)
|
||||
// We check in reverse to find the highest fully-installed tier
|
||||
const reversedTiers = [...category.tiers].reverse()
|
||||
|
||||
|
||||
for (const tier of reversedTiers) {
|
||||
const allResourcesInstalled = tier.resources.every((resource) => {
|
||||
// Check if resource is marked as downloaded in database
|
||||
|
|
@ -408,7 +408,7 @@ export class ZimService implements IZimService {
|
|||
|
||||
for (const collection of validated.collections) {
|
||||
const { resources, ...restCollection } = collection; // we'll handle resources separately
|
||||
|
||||
|
||||
// Upsert the collection itself
|
||||
await CuratedCollection.updateOrCreate(
|
||||
{ slug: restCollection.slug },
|
||||
|
|
@ -489,11 +489,11 @@ export class ZimService implements IZimService {
|
|||
options,
|
||||
currentSelection: selection
|
||||
? {
|
||||
optionId: selection.option_id,
|
||||
status: selection.status,
|
||||
filename: selection.filename,
|
||||
url: selection.url,
|
||||
}
|
||||
optionId: selection.option_id,
|
||||
status: selection.status,
|
||||
filename: selection.filename,
|
||||
url: selection.url,
|
||||
}
|
||||
: null,
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -152,7 +152,7 @@ export function matchesDevice(fsPath: string, deviceName: string): boolean {
|
|||
return false
|
||||
}
|
||||
|
||||
export function determineFileType(filename: string): 'image' | 'pdf' | 'text' | 'unknown' {
|
||||
export function determineFileType(filename: string): 'image' | 'pdf' | 'text' | 'zim' | 'unknown' {
|
||||
const ext = path.extname(filename).toLowerCase()
|
||||
if (['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'].includes(ext)) {
|
||||
return 'image'
|
||||
|
|
@ -160,6 +160,8 @@ export function determineFileType(filename: string): 'image' | 'pdf' | 'text' |
|
|||
return 'pdf'
|
||||
} else if (['.txt', '.md', '.docx', '.rtf'].includes(ext)) {
|
||||
return 'text'
|
||||
} else if (ext === '.zim') {
|
||||
return 'zim'
|
||||
} else {
|
||||
return 'unknown'
|
||||
}
|
||||
|
|
|
|||
48
admin/constants/zim_extraction.ts
Normal file
48
admin/constants/zim_extraction.ts
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
|
||||
export const HTML_SELECTORS_TO_REMOVE = [
|
||||
'script',
|
||||
'style',
|
||||
'nav',
|
||||
'header',
|
||||
'footer',
|
||||
'noscript',
|
||||
'iframe',
|
||||
'svg',
|
||||
'.navbox',
|
||||
'.sidebar',
|
||||
'.infobox',
|
||||
'.mw-editsection',
|
||||
'.reference',
|
||||
'.reflist',
|
||||
'.toc',
|
||||
'.noprint',
|
||||
'.mw-jump-link',
|
||||
'.mw-headline-anchor',
|
||||
'[role="navigation"]',
|
||||
'.navbar',
|
||||
'.hatnote',
|
||||
'.ambox',
|
||||
'.sistersitebox',
|
||||
'.portal',
|
||||
'#coordinates',
|
||||
'.geo-nondefault',
|
||||
'.authority-control',
|
||||
]
|
||||
|
||||
// Common heading names that usually don't have meaningful content under them
|
||||
export const NON_CONTENT_HEADING_PATTERNS = [
|
||||
/^see also$/i,
|
||||
/^references$/i,
|
||||
/^external links$/i,
|
||||
/^further reading$/i,
|
||||
/^notes$/i,
|
||||
/^bibliography$/i,
|
||||
/^navigation$/i,
|
||||
]
|
||||
|
||||
/**
|
||||
* Batch size for processing ZIM articles to prevent lock timeout errors.
|
||||
* Processing 50 articles at a time balances throughput with job duration.
|
||||
* Typical processing time: 2-5 minutes per batch depending on article complexity.
|
||||
*/
|
||||
export const ZIM_BATCH_SIZE = 50
|
||||
1501
admin/package-lock.json
generated
1501
admin/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
|
|
@ -76,6 +76,7 @@
|
|||
"@headlessui/react": "^2.2.4",
|
||||
"@inertiajs/react": "^2.0.13",
|
||||
"@markdoc/markdoc": "^0.5.2",
|
||||
"@openzim/libzim": "^4.0.0",
|
||||
"@protomaps/basemaps": "^5.7.0",
|
||||
"@qdrant/js-client-rest": "^1.16.2",
|
||||
"@tabler/icons-react": "^3.34.0",
|
||||
|
|
@ -92,6 +93,7 @@
|
|||
"axios": "^1.13.1",
|
||||
"better-sqlite3": "^12.1.1",
|
||||
"bullmq": "^5.65.1",
|
||||
"cheerio": "^1.2.0",
|
||||
"dockerode": "^4.0.7",
|
||||
"edge.js": "^6.2.1",
|
||||
"fast-xml-parser": "^5.2.5",
|
||||
|
|
|
|||
|
|
@ -64,3 +64,47 @@ export type RemoteZimFileEntry = {
|
|||
author: string
|
||||
file_name: string
|
||||
}
|
||||
|
||||
export type ExtractZIMContentOptions = {
|
||||
strategy?: ExtractZIMChunkingStrategy
|
||||
maxArticles?: number
|
||||
onProgress?: (processedArticles: number, totalArticles: number) => void
|
||||
// Batch processing options to avoid lock timeouts
|
||||
startOffset?: number // Article index to start from for resuming
|
||||
batchSize?: number // Max articles to process in this batch
|
||||
}
|
||||
|
||||
export type ExtractZIMChunkingStrategy = 'structured' | 'simple'
|
||||
|
||||
export type ZIMArchiveMetadata = {
|
||||
title: string
|
||||
creator: string
|
||||
publisher: string
|
||||
date: string
|
||||
language: string
|
||||
description: string
|
||||
}
|
||||
|
||||
export type ZIMContentChunk = {
|
||||
// Content
|
||||
text: string
|
||||
|
||||
// Article-level context
|
||||
articleTitle: string
|
||||
articlePath: string
|
||||
|
||||
// Section-level context for structured chunks
|
||||
sectionTitle: string
|
||||
fullTitle: string // Combined "Article Title - Section Title"
|
||||
hierarchy: string // Breadcrumb trail
|
||||
sectionLevel?: number // Heading level (2=h2, 3=h3, etc.)
|
||||
|
||||
// Document grouping
|
||||
documentId: string // Same for all chunks from one article
|
||||
|
||||
// Archive metadata
|
||||
archiveMetadata: ZIMArchiveMetadata
|
||||
|
||||
// Extraction metadata
|
||||
strategy: ExtractZIMChunkingStrategy
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user