import { randomUUID } from 'node:crypto'
import { Ollama } from 'ollama'
import { QdrantClient } from '@qdrant/js-client-rest'
import { DockerService } from './docker_service.js'
import { inject } from '@adonisjs/core'
import logger from '@adonisjs/core/services/logger'
import { chunk } from 'llm-chunk'
import { OpenWebUIService } from './openwebui_service.js'
import sharp from 'sharp'
import { determineFileType, getFile } from '../utils/fs.js'
import { PDFParse } from 'pdf-parse'
import { createWorker } from 'tesseract.js'
import { fromBuffer } from 'pdf2pic'

/**
 * Extracts text from uploaded files (plain text, PDF, images), splits it into
 * chunks, embeds each chunk with Ollama and stores the vectors in Qdrant.
 *
 * The Qdrant and Ollama clients are created lazily on first use, from the URLs
 * reported by the DockerService.
 */
@inject()
export class RagService {
  private qdrant: QdrantClient | null = null
  private ollama: Ollama | null = null
  private qdrantInitPromise: Promise<void> | null = null
  private ollamaInitPromise: Promise<void> | null = null

  // This is the collection name OWUI uses for uploaded knowledge
  public static CONTENT_COLLECTION_NAME = 'open-webui_knowledge'
  public static EMBEDDING_MODEL = 'nomic-embed-text:v1.5'
  // Nomic Embed Text v1.5 dimension is 768
  public static EMBEDDING_DIMENSION = 768

  constructor(
    private dockerService: DockerService,
    private openWebUIService: OpenWebUIService
  ) {}

  /**
   * Create the Qdrant client once; concurrent callers share the same in-flight promise.
   * @throws Error when the DockerService reports no URL for the Qdrant service
   */
  private async _initializeQdrantClient(): Promise<void> {
    if (!this.qdrantInitPromise) {
      this.qdrantInitPromise = (async () => {
        const qdrantUrl = await this.dockerService.getServiceURL(DockerService.QDRANT_SERVICE_NAME)
        if (!qdrantUrl) {
          throw new Error('Qdrant service is not installed or running.')
        }
        this.qdrant = new QdrantClient({ url: `http://${qdrantUrl}` })
      })().catch((error: unknown) => {
        // Do not cache a failed initialization: a later call must be able to retry.
        this.qdrantInitPromise = null
        throw error
      })
    }
    return this.qdrantInitPromise
  }

  /**
   * Create the Ollama client once; concurrent callers share the same in-flight promise.
   * @throws Error when the DockerService reports no URL for the Ollama service
   */
  private async _initializeOllamaClient(): Promise<void> {
    if (!this.ollamaInitPromise) {
      this.ollamaInitPromise = (async () => {
        const ollamaUrl = await this.dockerService.getServiceURL(DockerService.OLLAMA_SERVICE_NAME)
        if (!ollamaUrl) {
          throw new Error('Ollama service is not installed or running.')
        }
        this.ollama = new Ollama({ host: `http://${ollamaUrl}` })
      })().catch((error: unknown) => {
        // Do not cache a failed initialization: a later call must be able to retry.
        this.ollamaInitPromise = null
        throw error
      })
    }
    return this.ollamaInitPromise
  }

  /**
   * Make sure both the Qdrant and the Ollama clients have been created.
   */
  private async _ensureDependencies(): Promise<void> {
    if (!this.qdrant) {
      await this._initializeQdrantClient()
    }
    if (!this.ollama) {
      await this._initializeOllamaClient()
    }
  }

  /**
   * Create the given Qdrant collection (cosine distance) if it does not exist yet.
   * @param collectionName - Name of the collection to look up or create
   * @param dimensions - Vector size used when the collection has to be created
   * @throws Rethrows any initialization or Qdrant error after logging it
   */
  private async _ensureCollection(
    collectionName: string,
    dimensions: number = RagService.EMBEDDING_DIMENSION
  ): Promise<void> {
    try {
      await this._ensureDependencies()
      const collections = await this.qdrant!.getCollections()
      const collectionExists = collections.collections.some((col) => col.name === collectionName)
      if (!collectionExists) {
        await this.qdrant!.createCollection(collectionName, {
          vectors: {
            size: dimensions,
            distance: 'Cosine',
          },
        })
      }
    } catch (error) {
      // The error goes first in the merge object so the logger serializes it.
      logger.error({ err: error }, 'Error ensuring Qdrant collection')
      throw error
    }
  }

  /**
   * Chunk the text, embed every chunk and upsert the vectors into the content collection.
   * @param text - Raw text to embed
   * @param metadata - Extra payload fields stored alongside every chunk
   * @returns The number of stored chunks, or null when any step failed (the error is logged)
   */
  public async embedAndStoreText(
    text: string,
    metadata: Record<string, unknown> = {}
  ): Promise<{ chunks: number } | null> {
    try {
      await this._ensureCollection(
        RagService.CONTENT_COLLECTION_NAME,
        RagService.EMBEDDING_DIMENSION
      )

      const initModelResponse = await this.openWebUIService.downloadModelSync(
        RagService.EMBEDDING_MODEL
      )
      if (!initModelResponse.success) {
        throw new Error(
          `${RagService.EMBEDDING_MODEL} does not exist and could not be downloaded: ${initModelResponse.message}`
        )
      }

      const chunks = chunk(text, {
        // These settings should provide a good balance between context and precision
        minLength: 512,
        maxLength: 1024,
        overlap: 200,
      })
      if (!chunks || chunks.length === 0) {
        throw new Error('No text chunks generated for embedding.')
      }

      // Chunks are embedded one at a time, in order, so embeddings[i] belongs to chunks[i].
      const embeddings: number[][] = []
      for (const chunkText of chunks) {
        const response = await this.ollama!.embeddings({
          model: RagService.EMBEDDING_MODEL,
          prompt: chunkText,
        })
        embeddings.push(response.embedding)
      }

      const points = chunks.map((chunkText, index) => ({
        // Qdrant only accepts unsigned integers or UUIDs as point ids.
        id: randomUUID(),
        vector: embeddings[index],
        payload: {
          ...metadata,
          text: chunkText,
          chunk_index: index,
        },
      }))

      await this.qdrant!.upsert(RagService.CONTENT_COLLECTION_NAME, { points })
      return { chunks: chunks.length }
    } catch (error) {
      logger.error({ err: error }, 'Error embedding text')
      return null
    }
  }

  /**
   * Preprocess an image to enhance text extraction quality.
   * Grayscales, normalizes, sharpens, and resizes the image to a manageable size.
   * @param filebuffer - Buffer of the image file
   * @returns - Processed image buffer
   */
  private async preprocessImage(filebuffer: Buffer): Promise<Buffer> {
    return await sharp(filebuffer)
      .grayscale()
      .normalize()
      .sharpen()
      .resize({ width: 2000, fit: 'inside' })
      .toBuffer()
  }

  /**
   * If the original PDF has little to no extractable text,
   * we can use this method to convert each page to an image for OCR processing.
   * @param filebuffer - Buffer of the PDF file
   * @returns - Array of image buffers, one per successfully converted page
   */
  private async convertPDFtoImages(filebuffer: Buffer): Promise<Buffer[]> {
    const converted = await fromBuffer(filebuffer, {
      quality: 50,
      density: 200,
      format: 'png',
    }).bulk(-1, {
      responseType: 'buffer',
    })
    // Skip results that came back without a buffer.
    return converted.flatMap((res) => (res.buffer ? [res.buffer] : []))
  }

  /**
   * Extract text from a PDF file using pdf-parse.
   * @param filebuffer - Buffer of the PDF file
   * @returns - Extracted text
   */
  private async extractPDFText(filebuffer: Buffer): Promise<string> {
    const parser = new PDFParse({ data: filebuffer })
    try {
      const data = await parser.getText()
      return data.text
    } finally {
      // Release the parser even when text extraction throws.
      await parser.destroy()
    }
  }

  /**
   * Extract text from a plain text file.
   * @param filebuffer - Buffer of the text file
   * @returns - File content decoded as UTF-8
   */
  private async extractTXTText(filebuffer: Buffer): Promise<string> {
    return filebuffer.toString('utf-8')
  }

  /**
   * Extract text from an image file using Tesseract.js OCR.
   * @param filebuffer - Buffer of the image file
   * @returns - Extracted text
   */
  private async extractImageText(filebuffer: Buffer): Promise<string> {
    const worker = await createWorker('eng')
    try {
      const result = await worker.recognize(filebuffer)
      return result.data.text
    } finally {
      // Always shut the worker down, including when recognition fails.
      await worker.terminate()
    }
  }

  /**
   * Main pipeline to process and embed an uploaded file into the RAG knowledge base.
   * This includes text extraction, chunking, embedding, and storing in Qdrant.
   * @param filepath - Path of the uploaded file
   * @returns - Outcome flag plus a human-readable message; errors are logged, never thrown
   */
  public async processAndEmbedFile(
    filepath: string
  ): Promise<{ success: boolean; message: string }> {
    try {
      const fileType = determineFileType(filepath)
      if (fileType === 'unknown') {
        return { success: false, message: 'Unsupported file type.' }
      }

      const origFileBuffer = await getFile(filepath, 'buffer')
      if (!origFileBuffer) {
        return { success: false, message: 'Failed to read the uploaded file.' }
      }

      let extractedText = ''
      if (fileType === 'image') {
        const preprocessedBuffer = await this.preprocessImage(origFileBuffer)
        extractedText = await this.extractImageText(preprocessedBuffer)
      } else if (fileType === 'pdf') {
        extractedText = await this.extractPDFText(origFileBuffer)
        // Check if there was no extracted text or it was very minimal
        if (!extractedText || extractedText.trim().length < 100) {
          // Convert PDF pages to images for OCR
          const imageBuffers = await this.convertPDFtoImages(origFileBuffer)
          for (const imgBuffer of imageBuffers) {
            const preprocessedImg = await this.preprocessImage(imgBuffer)
            const pageText = await this.extractImageText(preprocessedImg)
            extractedText += pageText + '\n'
          }
        }
      } else {
        extractedText = await this.extractTXTText(origFileBuffer)
      }

      if (!extractedText || extractedText.trim().length === 0) {
        return { success: false, message: 'No text could be extracted from the file.' }
      }

      // embedAndStoreText reports failure by returning null instead of throwing.
      const embedResult = await this.embedAndStoreText(extractedText, {})
      if (!embedResult) {
        return { success: false, message: 'Failed to embed and store the extracted text.' }
      }

      return { success: true, message: 'File processed and embedded successfully.' }
    } catch (error) {
      logger.error({ err: error }, 'Error processing and embedding file')
      return { success: false, message: 'Error processing and embedding file.' }
    }
  }
}