mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-04-01 06:29:26 +02:00
- Test GPU detection logic with mocked exec calls - Test service installation guard and race condition prevention - Test container command splitting with quoted arguments - Test sanitizeFilename utility function - Test file type validation and error handling https://claude.ai/code/session_01JFvpTYgm8GiE4vJ4cJKsFx
455 lines
16 KiB
TypeScript
455 lines
16 KiB
TypeScript
import { test } from '@japa/runner'
|
||
import { RagService } from '#services/rag_service'
|
||
import { sanitizeFilename, determineFileType } from '../../../app/utils/fs.js'
|
||
|
||
/**
|
||
* Unit tests for RagService and related RAG utilities.
|
||
*
|
||
* These tests exercise pure logic (sanitisation, file type detection,
|
||
* text processing) without requiring Qdrant, Ollama, or Docker.
|
||
*/
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Helper: build a RagService with stubbed dependencies
|
||
// ---------------------------------------------------------------------------
|
||
function buildService(): RagService {
|
||
const svc = Object.create(RagService.prototype) as RagService
|
||
// Null out external clients so tests that call private helpers don't
|
||
// accidentally hit real services.
|
||
;(svc as any).qdrant = null
|
||
;(svc as any).qdrantInitPromise = null
|
||
;(svc as any).embeddingModelVerified = false
|
||
;(svc as any).dockerService = {
|
||
getServiceURL: async () => null,
|
||
}
|
||
;(svc as any).ollamaService = {
|
||
getModels: async () => [],
|
||
getClient: async () => ({}),
|
||
}
|
||
return svc
|
||
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// sanitizeFilename (exported utility)
|
||
// ---------------------------------------------------------------------------
|
||
test.group('sanitizeFilename', () => {
|
||
test('keeps alphanumeric, dots, hyphens, and underscores', ({ assert }) => {
|
||
assert.equal(sanitizeFilename('my-file_v2.txt'), 'my-file_v2.txt')
|
||
})
|
||
|
||
test('replaces spaces with underscores', ({ assert }) => {
|
||
assert.equal(sanitizeFilename('my file name.pdf'), 'my_file_name.pdf')
|
||
})
|
||
|
||
test('replaces special characters', ({ assert }) => {
|
||
assert.equal(sanitizeFilename('résumé (1).doc'), 'r_sum___1_.doc')
|
||
})
|
||
|
||
test('handles empty string', ({ assert }) => {
|
||
assert.equal(sanitizeFilename(''), '')
|
||
})
|
||
|
||
test('replaces path traversal characters', ({ assert }) => {
|
||
const result = sanitizeFilename('../../etc/passwd')
|
||
assert.isFalse(result.includes('/'))
|
||
assert.isFalse(result.includes('..'))
|
||
})
|
||
})
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// determineFileType (exported utility)
|
||
// ---------------------------------------------------------------------------
|
||
test.group('determineFileType', () => {
|
||
test('detects image extensions', ({ assert }) => {
|
||
for (const ext of ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp']) {
|
||
assert.equal(determineFileType(`photo${ext}`), 'image')
|
||
}
|
||
})
|
||
|
||
test('detects PDF', ({ assert }) => {
|
||
assert.equal(determineFileType('document.pdf'), 'pdf')
|
||
})
|
||
|
||
test('detects text-like files', ({ assert }) => {
|
||
for (const ext of ['.txt', '.md', '.docx', '.rtf']) {
|
||
assert.equal(determineFileType(`notes${ext}`), 'text')
|
||
}
|
||
})
|
||
|
||
test('detects ZIM files', ({ assert }) => {
|
||
assert.equal(determineFileType('wikipedia.zim'), 'zim')
|
||
})
|
||
|
||
test('returns unknown for unrecognised extension', ({ assert }) => {
|
||
assert.equal(determineFileType('archive.tar.gz'), 'unknown')
|
||
assert.equal(determineFileType('binary.exe'), 'unknown')
|
||
})
|
||
|
||
test('is case-insensitive', ({ assert }) => {
|
||
assert.equal(determineFileType('PHOTO.JPG'), 'image')
|
||
assert.equal(determineFileType('DOC.PDF'), 'pdf')
|
||
})
|
||
})
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// RagService – sanitizeText (private)
|
||
// ---------------------------------------------------------------------------
|
||
test.group('RagService – sanitizeText', () => {
|
||
test('removes null bytes', ({ assert }) => {
|
||
const svc = buildService()
|
||
const result = (svc as any).sanitizeText('hello\x00world')
|
||
assert.equal(result, 'helloworld')
|
||
})
|
||
|
||
test('removes control characters but preserves newlines and tabs', ({ assert }) => {
|
||
const svc = buildService()
|
||
const result = (svc as any).sanitizeText('line1\nline2\ttab\x01gone')
|
||
assert.equal(result, 'line1\nline2\ttab') // trimmed, \x01 removed
|
||
})
|
||
|
||
test('trims whitespace', ({ assert }) => {
|
||
const svc = buildService()
|
||
const result = (svc as any).sanitizeText(' hello ')
|
||
assert.equal(result, 'hello')
|
||
})
|
||
|
||
test('handles empty string', ({ assert }) => {
|
||
const svc = buildService()
|
||
const result = (svc as any).sanitizeText('')
|
||
assert.equal(result, '')
|
||
})
|
||
})
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// RagService – estimateTokenCount (private)
|
||
// ---------------------------------------------------------------------------
|
||
test.group('RagService – estimateTokenCount', () => {
|
||
test('estimates tokens at ~3 chars per token', ({ assert }) => {
|
||
const svc = buildService()
|
||
// 9 chars -> ceil(9/3) = 3
|
||
assert.equal((svc as any).estimateTokenCount('123456789'), 3)
|
||
})
|
||
|
||
test('rounds up fractional token counts', ({ assert }) => {
|
||
const svc = buildService()
|
||
// 10 chars -> ceil(10/3) = 4
|
||
assert.equal((svc as any).estimateTokenCount('1234567890'), 4)
|
||
})
|
||
|
||
test('returns 0 for empty string', ({ assert }) => {
|
||
const svc = buildService()
|
||
assert.equal((svc as any).estimateTokenCount(''), 0)
|
||
})
|
||
})
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// RagService – truncateToTokenLimit (private)
|
||
// ---------------------------------------------------------------------------
|
||
test.group('RagService – truncateToTokenLimit', () => {
|
||
test('returns text unchanged when within limit', ({ assert }) => {
|
||
const svc = buildService()
|
||
const text = 'short text'
|
||
const result = (svc as any).truncateToTokenLimit(text, 100)
|
||
assert.equal(result, text)
|
||
})
|
||
|
||
test('truncates long text at word boundary', ({ assert }) => {
|
||
const svc = buildService()
|
||
// maxTokens = 3 means max ~9 chars. "hello wor" -> should truncate to "hello"
|
||
const text = 'hello world, this is a long sentence'
|
||
const result = (svc as any).truncateToTokenLimit(text, 3)
|
||
assert.isBelow(result.length, text.length)
|
||
// Should not end mid-word
|
||
assert.isFalse(result.endsWith('worl'))
|
||
})
|
||
})
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// RagService – preprocessQuery (private)
|
||
// ---------------------------------------------------------------------------
|
||
test.group('RagService – preprocessQuery', () => {
|
||
test('expands known abbreviations', ({ assert }) => {
|
||
const svc = buildService()
|
||
const result = (svc as any).preprocessQuery('bob essentials')
|
||
assert.include(result, 'bug out bag')
|
||
})
|
||
|
||
test('preserves original query when no abbreviations found', ({ assert }) => {
|
||
const svc = buildService()
|
||
const result = (svc as any).preprocessQuery('water purification')
|
||
assert.equal(result, 'water purification')
|
||
})
|
||
|
||
test('handles multiple abbreviations', ({ assert }) => {
|
||
const svc = buildService()
|
||
const result = (svc as any).preprocessQuery('edc bob')
|
||
assert.include(result, 'every day carry')
|
||
assert.include(result, 'bug out bag')
|
||
})
|
||
|
||
test('trims whitespace', ({ assert }) => {
|
||
const svc = buildService()
|
||
const result = (svc as any).preprocessQuery(' hello ')
|
||
assert.equal(result, 'hello')
|
||
})
|
||
})
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// RagService – extractKeywords (private)
|
||
// ---------------------------------------------------------------------------
|
||
test.group('RagService – extractKeywords', () => {
|
||
test('removes stopwords and short tokens', ({ assert }) => {
|
||
const svc = buildService()
|
||
const result = (svc as any).extractKeywords('how to purify water in the wild')
|
||
// "how", "to", "in", "the" are stopwords or short; "purify", "water", "wild" remain
|
||
assert.include(result, 'purify')
|
||
assert.include(result, 'water')
|
||
assert.include(result, 'wild')
|
||
assert.notInclude(result, 'how')
|
||
assert.notInclude(result, 'the')
|
||
})
|
||
|
||
test('returns unique keywords', ({ assert }) => {
|
||
const svc = buildService()
|
||
const result = (svc as any).extractKeywords('water water everywhere')
|
||
const waterCount = result.filter((w: string) => w === 'water').length
|
||
assert.equal(waterCount, 1)
|
||
})
|
||
})
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// RagService – _ensureDependencies error handling
|
||
// ---------------------------------------------------------------------------
|
||
test.group('RagService – dependency initialization', () => {
|
||
test('throws when Qdrant URL cannot be resolved', async ({ assert }) => {
|
||
const svc = buildService()
|
||
// dockerService.getServiceURL returns null -> should throw
|
||
await assert.rejects(
|
||
() => (svc as any)._ensureDependencies(),
|
||
/Qdrant service is not installed or running/
|
||
)
|
||
})
|
||
|
||
test('caches initialization promise (singleton pattern)', async ({ assert }) => {
|
||
const svc = buildService()
|
||
let callCount = 0
|
||
;(svc as any).dockerService = {
|
||
getServiceURL: async () => {
|
||
callCount++
|
||
return 'http://localhost:6333'
|
||
},
|
||
}
|
||
|
||
// Call twice; should only invoke getServiceURL once
|
||
await (svc as any)._initializeQdrantClient()
|
||
await (svc as any)._initializeQdrantClient()
|
||
assert.equal(callCount, 1)
|
||
})
|
||
})
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// RagService – static configuration constants
|
||
// ---------------------------------------------------------------------------
|
||
test.group('RagService – static constants', () => {
|
||
test('EMBEDDING_DIMENSION is 768', ({ assert }) => {
|
||
assert.equal(RagService.EMBEDDING_DIMENSION, 768)
|
||
})
|
||
|
||
test('EMBEDDING_MODEL is nomic-embed-text:v1.5', ({ assert }) => {
|
||
assert.equal(RagService.EMBEDDING_MODEL, 'nomic-embed-text:v1.5')
|
||
})
|
||
|
||
test('SEARCH_DOCUMENT_PREFIX is set', ({ assert }) => {
|
||
assert.isTrue(RagService.SEARCH_DOCUMENT_PREFIX.length > 0)
|
||
})
|
||
|
||
test('SEARCH_QUERY_PREFIX is set', ({ assert }) => {
|
||
assert.isTrue(RagService.SEARCH_QUERY_PREFIX.length > 0)
|
||
})
|
||
|
||
test('MAX_SAFE_TOKENS is less than MODEL_CONTEXT_LENGTH', ({ assert }) => {
|
||
assert.isBelow(RagService.MAX_SAFE_TOKENS, RagService.MODEL_CONTEXT_LENGTH)
|
||
})
|
||
|
||
test('TARGET_TOKENS_PER_CHUNK is less than MAX_SAFE_TOKENS', ({ assert }) => {
|
||
assert.isBelow(RagService.TARGET_TOKENS_PER_CHUNK, RagService.MAX_SAFE_TOKENS)
|
||
})
|
||
|
||
test('UPLOADS_STORAGE_PATH is set', ({ assert }) => {
|
||
assert.isTrue(RagService.UPLOADS_STORAGE_PATH.length > 0)
|
||
})
|
||
|
||
test('CONTENT_COLLECTION_NAME is set', ({ assert }) => {
|
||
assert.equal(RagService.CONTENT_COLLECTION_NAME, 'nomad_knowledge_base')
|
||
})
|
||
})
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// sanitizeFilename – additional edge cases
|
||
// ---------------------------------------------------------------------------
|
||
test.group('sanitizeFilename – additional cases', () => {
|
||
test('replaces unicode characters', ({ assert }) => {
|
||
const result = sanitizeFilename('文件名.txt')
|
||
assert.isFalse(/[^\x00-\x7F]/.test(result))
|
||
assert.isTrue(result.endsWith('.txt'))
|
||
})
|
||
|
||
test('preserves dots in multi-dot filenames', ({ assert }) => {
|
||
assert.equal(sanitizeFilename('file.backup.tar.gz'), 'file.backup.tar.gz')
|
||
})
|
||
|
||
test('handles very long filenames', ({ assert }) => {
|
||
const longName = 'a'.repeat(255) + '.txt'
|
||
const result = sanitizeFilename(longName)
|
||
assert.equal(result, longName) // all valid chars, should be unchanged
|
||
})
|
||
|
||
test('replaces backslashes', ({ assert }) => {
|
||
const result = sanitizeFilename('path\\to\\file.txt')
|
||
assert.isFalse(result.includes('\\'))
|
||
})
|
||
|
||
test('replaces colons', ({ assert }) => {
|
||
const result = sanitizeFilename('file:name.txt')
|
||
assert.isFalse(result.includes(':'))
|
||
})
|
||
})
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// determineFileType – additional edge cases
|
||
// ---------------------------------------------------------------------------
|
||
test.group('determineFileType – additional cases', () => {
|
||
test('handles files with no extension', ({ assert }) => {
|
||
assert.equal(determineFileType('README'), 'unknown')
|
||
})
|
||
|
||
test('handles files with only a dot', ({ assert }) => {
|
||
assert.equal(determineFileType('.hidden'), 'unknown')
|
||
})
|
||
|
||
test('handles mixed-case extensions', ({ assert }) => {
|
||
assert.equal(determineFileType('photo.PNG'), 'image')
|
||
assert.equal(determineFileType('doc.Pdf'), 'pdf')
|
||
assert.equal(determineFileType('notes.TXT'), 'text')
|
||
assert.equal(determineFileType('wiki.ZIM'), 'zim')
|
||
})
|
||
|
||
test('detects TIFF images', ({ assert }) => {
|
||
assert.equal(determineFileType('scan.tiff'), 'image')
|
||
})
|
||
|
||
test('detects WEBP images', ({ assert }) => {
|
||
assert.equal(determineFileType('photo.webp'), 'image')
|
||
})
|
||
|
||
test('detects markdown as text', ({ assert }) => {
|
||
assert.equal(determineFileType('readme.md'), 'text')
|
||
})
|
||
|
||
test('detects docx as text', ({ assert }) => {
|
||
assert.equal(determineFileType('document.docx'), 'text')
|
||
})
|
||
|
||
test('detects rtf as text', ({ assert }) => {
|
||
assert.equal(determineFileType('letter.rtf'), 'text')
|
||
})
|
||
})
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// RagService – sanitizeText additional edge cases
|
||
// ---------------------------------------------------------------------------
|
||
test.group('RagService – sanitizeText edge cases', () => {
|
||
test('removes invalid Unicode surrogates', ({ assert }) => {
|
||
const svc = buildService()
|
||
const result = (svc as any).sanitizeText('hello\uD800world')
|
||
assert.equal(result, 'helloworld')
|
||
})
|
||
|
||
test('preserves carriage returns', ({ assert }) => {
|
||
const svc = buildService()
|
||
const result = (svc as any).sanitizeText('line1\r\nline2')
|
||
assert.include(result, '\r\n')
|
||
})
|
||
|
||
test('handles text with only control characters', ({ assert }) => {
|
||
const svc = buildService()
|
||
const result = (svc as any).sanitizeText('\x01\x02\x03')
|
||
assert.equal(result, '')
|
||
})
|
||
|
||
test('handles mixed valid and invalid content', ({ assert }) => {
|
||
const svc = buildService()
|
||
const result = (svc as any).sanitizeText('valid\x00text\x01here')
|
||
assert.equal(result, 'validtexthere')
|
||
})
|
||
})
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// RagService – _ensureDependencies additional error handling
|
||
// ---------------------------------------------------------------------------
|
||
test.group('RagService – error handling', () => {
|
||
test('_initializeQdrantClient throws when docker service returns null URL', async ({ assert }) => {
|
||
const svc = buildService()
|
||
;(svc as any).dockerService = {
|
||
getServiceURL: async () => null,
|
||
}
|
||
// Reset promise so it re-initializes
|
||
;(svc as any).qdrantInitPromise = null
|
||
|
||
await assert.rejects(
|
||
() => (svc as any)._initializeQdrantClient(),
|
||
/Qdrant service is not installed or running/
|
||
)
|
||
})
|
||
|
||
test('_ensureDependencies calls _initializeQdrantClient when qdrant is null', async ({ assert }) => {
|
||
const svc = buildService()
|
||
let initCalled = false
|
||
;(svc as any).qdrant = null
|
||
;(svc as any).qdrantInitPromise = null
|
||
;(svc as any).dockerService = {
|
||
getServiceURL: async () => {
|
||
initCalled = true
|
||
return 'http://localhost:6333'
|
||
},
|
||
}
|
||
|
||
await (svc as any)._ensureDependencies()
|
||
assert.isTrue(initCalled)
|
||
assert.isNotNull((svc as any).qdrant)
|
||
})
|
||
|
||
test('_ensureDependencies skips init when qdrant already set', async ({ assert }) => {
|
||
const svc = buildService()
|
||
const fakeClient = { fake: true }
|
||
;(svc as any).qdrant = fakeClient
|
||
let initCalled = false
|
||
;(svc as any)._initializeQdrantClient = async () => { initCalled = true }
|
||
|
||
await (svc as any)._ensureDependencies()
|
||
assert.isFalse(initCalled)
|
||
})
|
||
})
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// RagService – QUERY_EXPANSION_DICTIONARY
|
||
// ---------------------------------------------------------------------------
|
||
test.group('RagService – query expansion dictionary', () => {
|
||
test('dictionary contains common preparedness abbreviations', ({ assert }) => {
|
||
const dict = (RagService as any).QUERY_EXPANSION_DICTIONARY
|
||
assert.property(dict, 'bob')
|
||
assert.property(dict, 'edc')
|
||
assert.property(dict, 'shtf')
|
||
assert.property(dict, 'emp')
|
||
assert.property(dict, 'ifak')
|
||
})
|
||
|
||
test('all dictionary values are non-empty strings', ({ assert }) => {
|
||
const dict = (RagService as any).QUERY_EXPANSION_DICTIONARY
|
||
for (const [key, value] of Object.entries(dict)) {
|
||
assert.isString(value, `Value for '${key}' should be a string`)
|
||
assert.isAbove((value as string).length, 0, `Value for '${key}' should not be empty`)
|
||
}
|
||
})
|
||
})
|