project-nomad/admin/app/services/rag_service.ts
2026-01-31 20:39:49 -08:00

263 lines
8.5 KiB
TypeScript

import { randomUUID } from 'node:crypto'
import { inject } from '@adonisjs/core'
import logger from '@adonisjs/core/services/logger'
import { QdrantClient } from '@qdrant/js-client-rest'
import { chunk } from 'llm-chunk'
import { Ollama } from 'ollama'
import { PDFParse } from 'pdf-parse'
import { fromBuffer } from 'pdf2pic'
import sharp from 'sharp'
import { createWorker } from 'tesseract.js'
import { DockerService } from './docker_service.js'
import { OpenWebUIService } from './openwebui_service.js'
import { determineFileType, getFile } from '../utils/fs.js'
@inject()
export class RagService {
  private qdrant: QdrantClient | null = null
  private ollama: Ollama | null = null
  // Cached init promises so concurrent callers share a single connection attempt.
  private qdrantInitPromise: Promise<void> | null = null
  private ollamaInitPromise: Promise<void> | null = null

  public static CONTENT_COLLECTION_NAME = 'open-webui_knowledge' // This is the collection name OWUI uses for uploaded knowledge
  public static EMBEDDING_MODEL = 'nomic-embed-text:v1.5'
  public static EMBEDDING_DIMENSION = 768 // Nomic Embed Text v1.5 dimension is 768

  constructor(
    private dockerService: DockerService,
    private openWebUIService: OpenWebUIService
  ) {}

  /**
   * Lazily create the Qdrant client, deduplicating concurrent callers via a
   * cached promise. On failure the cached promise is cleared so a later call
   * can retry — otherwise every future call would re-await the same rejection
   * even after the service comes back up.
   */
  private async _initializeQdrantClient() {
    if (!this.qdrantInitPromise) {
      this.qdrantInitPromise = (async () => {
        const qdrantUrl = await this.dockerService.getServiceURL(DockerService.QDRANT_SERVICE_NAME)
        if (!qdrantUrl) {
          throw new Error('Qdrant service is not installed or running.')
        }
        this.qdrant = new QdrantClient({ url: `http://${qdrantUrl}` })
      })().catch((error: unknown) => {
        // Drop the cached rejection so the next caller retries initialization
        this.qdrantInitPromise = null
        throw error
      })
    }
    return this.qdrantInitPromise
  }

  /**
   * Lazily create the Ollama client. Same caching/retry semantics as
   * `_initializeQdrantClient`.
   */
  private async _initializeOllamaClient() {
    if (!this.ollamaInitPromise) {
      this.ollamaInitPromise = (async () => {
        const ollamaUrl = await this.dockerService.getServiceURL(DockerService.OLLAMA_SERVICE_NAME)
        if (!ollamaUrl) {
          throw new Error('Ollama service is not installed or running.')
        }
        this.ollama = new Ollama({ host: `http://${ollamaUrl}` })
      })().catch((error: unknown) => {
        // Drop the cached rejection so the next caller retries initialization
        this.ollamaInitPromise = null
        throw error
      })
    }
    return this.ollamaInitPromise
  }

  /**
   * Ensure both the Qdrant and Ollama clients are ready. The two
   * initializations are independent, so they run in parallel.
   */
  private async _ensureDependencies() {
    await Promise.all([
      this.qdrant ? Promise.resolve() : this._initializeQdrantClient(),
      this.ollama ? Promise.resolve() : this._initializeOllamaClient(),
    ])
  }

  /**
   * Create the given Qdrant collection (cosine distance) if it does not
   * already exist.
   * @param collectionName - Name of the collection to ensure
   * @param dimensions - Vector size; defaults to the embedding model's dimension
   * @throws Re-throws any Qdrant client error after logging it
   */
  private async _ensureCollection(
    collectionName: string,
    dimensions: number = RagService.EMBEDDING_DIMENSION
  ) {
    try {
      await this._ensureDependencies()
      const { collections } = await this.qdrant!.getCollections()
      const collectionExists = collections.some((col) => col.name === collectionName)
      if (!collectionExists) {
        await this.qdrant!.createCollection(collectionName, {
          vectors: {
            size: dimensions,
            distance: 'Cosine',
          },
        })
      }
    } catch (error) {
      logger.error('Error ensuring Qdrant collection:', error)
      throw error
    }
  }

  /**
   * Chunk, embed, and store a piece of text in the shared knowledge collection.
   * @param text - Raw text to embed
   * @param metadata - Extra payload fields stored alongside each chunk
   * @returns The number of chunks stored, or null on any failure (errors are logged)
   */
  public async embedAndStoreText(
    text: string,
    metadata: Record<string, any> = {}
  ): Promise<{ chunks: number } | null> {
    try {
      await this._ensureCollection(
        RagService.CONTENT_COLLECTION_NAME,
        RagService.EMBEDDING_DIMENSION
      )
      // Make sure the embedding model is available before we start chunking
      const initModelResponse = await this.openWebUIService.downloadModelSync(
        RagService.EMBEDDING_MODEL
      )
      if (!initModelResponse.success) {
        throw new Error(
          `${RagService.EMBEDDING_MODEL} does not exist and could not be downloaded: ${initModelResponse.message}`
        )
      }
      const chunks = chunk(text, {
        // These settings should provide a good balance between context and precision
        minLength: 512,
        maxLength: 1024,
        overlap: 200,
      })
      if (!chunks || chunks.length === 0) {
        throw new Error('No text chunks generated for embedding.')
      }
      // Embed sequentially to avoid overloading the local Ollama instance
      const embeddings: number[][] = []
      for (const chunkText of chunks) {
        const response = await this.ollama!.embeddings({
          model: RagService.EMBEDDING_MODEL,
          prompt: chunkText,
        })
        embeddings.push(response.embedding)
      }
      const points = chunks.map((chunkText, index) => ({
        // Qdrant only accepts unsigned integers or UUIDs as point IDs; a
        // `${Date.now()}_${index}` string is rejected by the server (and
        // would also collide across calls within the same millisecond).
        id: randomUUID(),
        vector: embeddings[index],
        payload: {
          ...metadata,
          text: chunkText,
          chunk_index: index,
        },
      }))
      await this.qdrant!.upsert(RagService.CONTENT_COLLECTION_NAME, { points })
      return { chunks: chunks.length }
    } catch (error) {
      logger.error('Error embedding text:', error)
      return null
    }
  }

  /**
   * Preprocess an image to enhance text extraction quality.
   * Normalizes, grayscales, sharpens, and resizes the image to a manageable size.
   * @param filebuffer Buffer of the image file
   * @returns - Processed image buffer
   */
  private async preprocessImage(filebuffer: Buffer): Promise<Buffer> {
    return await sharp(filebuffer)
      .grayscale()
      .normalize()
      .sharpen()
      .resize({ width: 2000, fit: 'inside' })
      .toBuffer()
  }

  /**
   * If the original PDF has little to no extractable text,
   * we can use this method to convert each page to an image for OCR processing.
   * @param filebuffer - Buffer of the PDF file
   * @returns - Array of image buffers, one per page
   */
  private async convertPDFtoImages(filebuffer: Buffer): Promise<Buffer[]> {
    const converted = await fromBuffer(filebuffer, {
      quality: 50,
      density: 200,
      format: 'png',
    }).bulk(-1, {
      responseType: 'buffer',
    })
    // Pages that failed to render have no buffer; skip them
    return converted.filter((res) => res.buffer).map((res) => res.buffer!)
  }

  /**
   * Extract text from a PDF file using pdf-parse.
   * @param filebuffer - Buffer of the PDF file
   * @returns - Extracted text
   */
  private async extractPDFText(filebuffer: Buffer): Promise<string> {
    const parser = new PDFParse({ data: filebuffer })
    try {
      const data = await parser.getText()
      return data.text
    } finally {
      // Release parser resources even if getText() throws
      await parser.destroy()
    }
  }

  /**
   * Extract text from a plain text file.
   * @param filebuffer - Buffer of the text file
   * @returns - Extracted text
   */
  private async extractTXTText(filebuffer: Buffer): Promise<string> {
    return filebuffer.toString('utf-8')
  }

  /**
   * Extract text from an image file using Tesseract.js OCR.
   * @param filebuffer - Buffer of the image file
   * @returns - Extracted text
   */
  private async extractImageText(filebuffer: Buffer): Promise<string> {
    const worker = await createWorker('eng')
    try {
      const result = await worker.recognize(filebuffer)
      return result.data.text
    } finally {
      // Always terminate the worker, even when OCR fails, to avoid leaking it
      await worker.terminate()
    }
  }

  /**
   * Main pipeline to process and embed an uploaded file into the RAG knowledge base.
   * This includes text extraction, chunking, embedding, and storing in Qdrant.
   * @param filepath - Path to the uploaded file on disk
   * @returns Success flag plus a human-readable message (never throws)
   */
  public async processAndEmbedFile(
    filepath: string
  ): Promise<{ success: boolean; message: string }> {
    try {
      const fileType = determineFileType(filepath)
      if (fileType === 'unknown') {
        return { success: false, message: 'Unsupported file type.' }
      }
      const origFileBuffer = await getFile(filepath, 'buffer')
      if (!origFileBuffer) {
        return { success: false, message: 'Failed to read the uploaded file.' }
      }
      let extractedText = ''
      if (fileType === 'image') {
        const preprocessedBuffer = await this.preprocessImage(origFileBuffer)
        extractedText = await this.extractImageText(preprocessedBuffer)
      } else if (fileType === 'pdf') {
        extractedText = await this.extractPDFText(origFileBuffer)
        // Check if there was no extracted text or it was very minimal
        // (likely a scanned document) — fall back to per-page OCR
        if (!extractedText || extractedText.trim().length < 100) {
          const imageBuffers = await this.convertPDFtoImages(origFileBuffer)
          for (const imgBuffer of imageBuffers) {
            const preprocessedImg = await this.preprocessImage(imgBuffer)
            const pageText = await this.extractImageText(preprocessedImg)
            extractedText += pageText + '\n'
          }
        }
      } else {
        extractedText = await this.extractTXTText(origFileBuffer)
      }
      if (!extractedText || extractedText.trim().length === 0) {
        return { success: false, message: 'No text could be extracted from the file.' }
      }
      // embedAndStoreText returns null on failure; previously this was
      // ignored and the method reported success even when embedding failed
      const embedResult = await this.embedAndStoreText(extractedText, {})
      if (!embedResult) {
        return { success: false, message: 'Error processing and embedding file.' }
      }
      return { success: true, message: 'File processed and embedded successfully.' }
    } catch (error) {
      logger.error('Error processing and embedding file:', error)
      return { success: false, message: 'Error processing and embedding file.' }
    }
  }
}