import { QdrantClient } from '@qdrant/js-client-rest' import { DockerService } from './docker_service.js' import { inject } from '@adonisjs/core' import logger from '@adonisjs/core/services/logger' import { chunk } from 'llm-chunk' import sharp from 'sharp' import { determineFileType, getFile } from '../utils/fs.js' import { PDFParse } from 'pdf-parse' import { createWorker } from 'tesseract.js' import { fromBuffer } from 'pdf2pic' import { OllamaService } from './ollama_service.js' @inject() export class RagService { private qdrant: QdrantClient | null = null private qdrantInitPromise: Promise | null = null public static CONTENT_COLLECTION_NAME = 'open-webui_knowledge' // This is the collection name OWUI uses for uploaded knowledge public static EMBEDDING_MODEL = 'nomic-embed-text:v1.5' public static EMBEDDING_DIMENSION = 768 // Nomic Embed Text v1.5 dimension is 768 constructor( private dockerService: DockerService, private ollamaService: OllamaService ) {} private async _initializeQdrantClient() { if (!this.qdrantInitPromise) { this.qdrantInitPromise = (async () => { const qdrantUrl = await this.dockerService.getServiceURL(DockerService.QDRANT_SERVICE_NAME) if (!qdrantUrl) { throw new Error('Qdrant service is not installed or running.') } this.qdrant = new QdrantClient({ url: qdrantUrl }) })() } return this.qdrantInitPromise } private async _ensureDependencies() { if (!this.qdrant) { await this._initializeQdrantClient() } } private async _ensureCollection( collectionName: string, dimensions: number = RagService.EMBEDDING_DIMENSION ) { try { await this._ensureDependencies() const collections = await this.qdrant!.getCollections() const collectionExists = collections.collections.some((col) => col.name === collectionName) if (!collectionExists) { await this.qdrant!.createCollection(collectionName, { vectors: { size: dimensions, distance: 'Cosine', }, }) } } catch (error) { logger.error('Error ensuring Qdrant collection:', error) throw error } } public async embedAndStoreText( text: string, metadata: Record = {} ): Promise<{ chunks: number } | null> { try { await this._ensureCollection( RagService.CONTENT_COLLECTION_NAME, RagService.EMBEDDING_DIMENSION ) const allModels = await this.ollamaService.getModels(true) const embeddingModel = allModels.find((model) => model.name === RagService.EMBEDDING_MODEL) // TODO: Attempt to download the embedding model if not found if (!embeddingModel) { throw new Error(`${RagService.EMBEDDING_MODEL} does not exist and could not be downloaded.`) } const chunks = chunk(text, { // These settings should provide a good balance between context and precision minLength: 512, maxLength: 1024, overlap: 200, }) if (!chunks || chunks.length === 0) { throw new Error('No text chunks generated for embedding.') } const embeddings: number[][] = [] const ollamaClient = await this.ollamaService.getClient() for (const chunkText of chunks) { const response = await ollamaClient.embeddings({ model: RagService.EMBEDDING_MODEL, prompt: chunkText, }) embeddings.push(response.embedding) } const points = chunks.map((chunkText, index) => ({ id: `${Date.now()}_${index}`, vector: embeddings[index], payload: { ...metadata, text: chunkText, chunk_index: index, }, })) await this.qdrant!.upsert(RagService.CONTENT_COLLECTION_NAME, { points }) return { chunks: chunks.length } } catch (error) { logger.error('Error embedding text:', error) return null } } /** * Preprocess an image to enhance text extraction quality. * Normalizes, grayscales, sharpens, and resizes the image to a manageable size. * @param filebuffer Buffer of the image file * @returns - Processed image buffer */ private async preprocessImage(filebuffer: Buffer): Promise { return await sharp(filebuffer) .grayscale() .normalize() .sharpen() .resize({ width: 2000, fit: 'inside' }) .toBuffer() } /** * If the original PDF has little to no extractable text, * we can use this method to convert each page to an image for OCR processing. * @param filebuffer - Buffer of the PDF file * @returns - Array of image buffers, one per page */ private async convertPDFtoImages(filebuffer: Buffer): Promise { const converted = await fromBuffer(filebuffer, { quality: 50, density: 200, format: 'png', }).bulk(-1, { responseType: 'buffer', }) return converted.filter((res) => res.buffer).map((res) => res.buffer!) } /** * Extract text from a PDF file using pdf-parse. * @param filebuffer - Buffer of the PDF file * @returns - Extracted text */ private async extractPDFText(filebuffer: Buffer): Promise { const parser = new PDFParse({ data: filebuffer }) const data = await parser.getText() await parser.destroy() return data.text } /** * Extract text from a plain text file. * @param filebuffer - Buffer of the text file * @returns - Extracted text */ private async extractTXTText(filebuffer: Buffer): Promise { return filebuffer.toString('utf-8') } /** * Extract text from an image file using Tesseract.js OCR. * @param filebuffer - Buffer of the image file * @returns - Extracted text */ private async extractImageText(filebuffer: Buffer): Promise { const worker = await createWorker('eng') const result = await worker.recognize(filebuffer) await worker.terminate() return result.data.text } /** * Main pipeline to process and embed an uploaded file into the RAG knowledge base. * This includes text extraction, chunking, embedding, and storing in Qdrant. */ public async processAndEmbedFile( filepath: string ): Promise<{ success: boolean; message: string; chunks?: number }> { try { const fileType = determineFileType(filepath) if (fileType === 'unknown') { return { success: false, message: 'Unsupported file type.' } } const origFileBuffer = await getFile(filepath, 'buffer') if (!origFileBuffer) { return { success: false, message: 'Failed to read the uploaded file.' } } let extractedText = '' if (fileType === 'image') { const preprocessedBuffer = await this.preprocessImage(origFileBuffer) extractedText = await this.extractImageText(preprocessedBuffer) } else if (fileType === 'pdf') { extractedText = await this.extractPDFText(origFileBuffer) // Check if there was no extracted text or it was very minimal if (!extractedText || extractedText.trim().length < 100) { // Convert PDF pages to images for OCR const imageBuffers = await this.convertPDFtoImages(origFileBuffer) for (const imgBuffer of imageBuffers) { const preprocessedImg = await this.preprocessImage(imgBuffer) const pageText = await this.extractImageText(preprocessedImg) extractedText += pageText + '\n' } } } else { extractedText = await this.extractTXTText(origFileBuffer) } if (!extractedText || extractedText.trim().length === 0) { return { success: false, message: 'No text could be extracted from the file.' } } const embedResult = await this.embedAndStoreText(extractedText, {}) return { success: true, message: 'File processed and embedded successfully.', chunks: embedResult?.chunks, } } catch (error) { logger.error('Error processing and embedding file:', error) return { success: false, message: 'Error processing and embedding file.' } } } /** * Search for documents similar to the query text in the Qdrant knowledge base. * Returns the most relevant text chunks based on semantic similarity. * @param query - The search query text * @param limit - Maximum number of results to return (default: 5) * @param scoreThreshold - Minimum similarity score threshold (default: 0.7) * @returns Array of relevant text chunks with their scores */ public async searchSimilarDocuments( query: string, limit: number = 5, scoreThreshold: number = 0.7 ): Promise> { try { await this._ensureCollection( RagService.CONTENT_COLLECTION_NAME, RagService.EMBEDDING_DIMENSION ) const allModels = await this.ollamaService.getModels(true) const embeddingModel = allModels.find((model) => model.name === RagService.EMBEDDING_MODEL) if (!embeddingModel) { logger.warn( `${RagService.EMBEDDING_MODEL} not found. Cannot perform similarity search.` ) return [] } // Generate embedding for the query const ollamaClient = await this.ollamaService.getClient() const response = await ollamaClient.embeddings({ model: RagService.EMBEDDING_MODEL, prompt: query, }) // Search for similar vectors in Qdrant const searchResults = await this.qdrant!.search(RagService.CONTENT_COLLECTION_NAME, { vector: response.embedding, limit: limit, score_threshold: scoreThreshold, with_payload: true, }) console.log("Got search results:", searchResults); return searchResults.map((result) => ({ text: (result.payload?.text as string) || '', score: result.score, })) } catch (error) { logger.error('Error searching similar documents:', error) return [] } } /** * Retrieve all unique source files that have been stored in the knowledge base. * @returns Array of unique source file identifiers */ public async getStoredFiles(): Promise { try { await this._ensureCollection( RagService.CONTENT_COLLECTION_NAME, RagService.EMBEDDING_DIMENSION ) const sources = new Set() let offset: string | number | null | Record = null const batchSize = 100 // Scroll through all points in the collection do { const scrollResult = await this.qdrant!.scroll(RagService.CONTENT_COLLECTION_NAME, { limit: batchSize, offset: offset, with_payload: true, with_vector: false, }) // Extract unique source values from payloads scrollResult.points.forEach((point) => { const metadata = point.payload?.metadata if (metadata && typeof metadata === 'object' && 'source' in metadata) { const source = metadata.source as string sources.add(source) } }) offset = scrollResult.next_page_offset || null } while (offset !== null) return Array.from(sources) } catch (error) { logger.error('Error retrieving stored files:', error) return [] } } }