mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-05-30 16:16:50 +02:00
feat(KB): ratio registry for disk + time estimates (Phase 1B of RFC #883)
Foundation for the cost estimates and partial-stall detection that Phase 2 will surface. No consumers yet — this PR just lays the table, the seed rows, and the lookup helper so subsequent UI work has estimates available without a per-ZIM benchmark.

## What lands

- New table `kb_ratio_registry` (pattern, chunks_per_mb, sample_count, notes). The migration creates the table and seeds it with heuristic defaults from the RFC appendix: devdocs (1100/MB), Wikipedia variants (270/MB), iFixit (50/MB), Stack Exchange Q&A (200/MB), video-only ZIMs (0), plus a catch-all fallback at 100/MB.
- `KbRatioRegistry` model with static `lookup()` and `estimateChunks()`.
- Pure helper `kb_ratio_lookup.ts` doing longest-prefix-match — a specific entry (`wikipedia_en_simple_`) overrides a broader one (`wikipedia_en_`). 9 unit tests covering the lookup boundary.
- `sample_count` starts at 0 (heuristic seed) and is reserved for Phase 4 self-calibration to increment as observed ZIMs update each row.

## Not in scope

- Self-calibration on successful ingestion (Phase 4).
- UI consumers — Warning B (partial-embed stall) and the storage budget meter / time estimates land in Phase 2.

## Tested

- Type-check clean.
- 9 unit tests pass for `findChunksPerMb` and `estimateChunkCount`.
- Migration applied on NOMAD3 via hot-patch; 9 seed rows verified in DB.
This commit is contained in:
parent
8ce5790ab5
commit
68e1bd5ff2
51
admin/app/models/kb_ratio_registry.ts
Normal file
51
admin/app/models/kb_ratio_registry.ts
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
import { DateTime } from 'luxon'
|
||||
import { BaseModel, column, SnakeCaseNamingStrategy } from '@adonisjs/lucid/orm'
|
||||
import { findChunksPerMb, estimateChunkCount } from '../utils/kb_ratio_lookup.js'
|
||||
|
||||
/**
 * Registry of `{filename-prefix → chunks_per_mb}` ratios used for
 * disk-footprint and time-to-embed estimates surfaced in the KB panel.
 *
 * The migration seeds the registry with heuristic defaults from the RFC #883
 * appendix; Phase 4 self-calibration will update rows in place as ZIMs finish
 * ingesting and the real ratio becomes known. Lookup is longest-prefix-match
 * (see `kb_ratio_lookup.ts`), so a specific entry (`wikipedia_en_simple_`)
 * overrides a broader one (`wikipedia_en_`).
 */
export default class KbRatioRegistry extends BaseModel {
  static table = 'kb_ratio_registry'
  static namingStrategy = new SnakeCaseNamingStrategy()

  @column({ isPrimary: true })
  declare id: number

  /** Filename prefix this row applies to; the empty string is the catch-all. */
  @column()
  declare pattern: string

  /**
   * Estimated embedding chunks per megabyte of file.
   *
   * The column is `decimal(10, 2)` in the migration. The mysql2 and pg drivers
   * hand DECIMAL/NUMERIC values back as strings (e.g. `"1100.00"`) unless
   * configured otherwise, which would make this attribute a string at runtime
   * despite the `number` declaration. Coerce on hydrate so `lookup()` really
   * returns a number.
   */
  @column({ consume: (value: unknown) => Number(value) })
  declare chunks_per_mb: number

  /** 0 for a heuristic seed; reserved for Phase 4 self-calibration to increment. */
  @column()
  declare sample_count: number

  @column()
  declare notes: string | null

  @column.dateTime({ autoCreate: true })
  declare created_at: DateTime

  @column.dateTime({ autoCreate: true, autoUpdate: true })
  declare updated_at: DateTime

  /**
   * Look up chunks_per_mb for a filename by longest-prefix match.
   *
   * Loads every registry row and delegates to `findChunksPerMb`.
   *
   * @param filename File name to match against the registered prefixes.
   * @returns The matching ratio, or `null` when no row (including the
   *   empty-string fallback) matches.
   */
  static async lookup(filename: string): Promise<number | null> {
    const rows = await this.all()
    return findChunksPerMb(filename, rows)
  }

  /**
   * Estimate total chunks for a file of the given size on disk.
   *
   * Loads every registry row and delegates to `estimateChunkCount`.
   *
   * @param filename File name to match against the registered prefixes.
   * @param fileSizeBytes Size of the file on disk, in bytes.
   * @returns The rounded chunk estimate, or `null` when no row matches.
   */
  static async estimateChunks(filename: string, fileSizeBytes: number): Promise<number | null> {
    const rows = await this.all()
    return estimateChunkCount(filename, fileSizeBytes, rows)
  }
}
|
||||
44
admin/app/utils/kb_ratio_lookup.ts
Normal file
44
admin/app/utils/kb_ratio_lookup.ts
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
/**
 * The two fields of a registry row that the lookup helpers read. Kept as a
 * plain structural type so callers can pass model instances or bare objects.
 */
export interface RatioRow {
  /** Filename prefix this row applies to; `''` matches every filename. */
  pattern: string
  /** Estimated embedding chunks per megabyte of file. */
  chunks_per_mb: number
}

/**
 * Resolve the chunks_per_mb ratio for `filename` using longest-prefix match.
 *
 * Each row's `pattern` is treated as a filename prefix (`devdocs_`,
 * `wikipedia_en_simple_`, ...). Among all rows whose pattern prefixes the
 * filename, the one with the longest pattern is chosen, so a specific entry
 * (`wikipedia_en_simple_`) beats a broader one (`wikipedia_en_`). A row with
 * an empty-string pattern matches every filename and therefore acts as the
 * catch-all fallback.
 *
 * @param filename File name to classify.
 * @param rows Candidate registry rows, in any order.
 * @returns The winning row's `chunks_per_mb`, or `null` when nothing matches
 *   (no empty-string fallback present) — the caller decides whether to show
 *   "unknown" or apply its own default.
 */
export function findChunksPerMb(filename: string, rows: RatioRow[]): number | null {
  const winner = rows.reduce<RatioRow | null>((leader, candidate) => {
    if (!filename.startsWith(candidate.pattern)) return leader
    // Strict `>`: a duplicate pattern later in the list never displaces the first.
    const isLonger = leader === null || candidate.pattern.length > leader.pattern.length
    return isLonger ? candidate : leader
  }, null)

  if (winner === null) return null
  return winner.chunks_per_mb
}
|
||||
|
||||
/**
 * Project how many embedding chunks a ZIM-style file will yield from its size
 * on disk.
 *
 * The ratio is resolved with `findChunksPerMb`; the size is converted to
 * megabytes using 1024 × 1024 bytes per MB and the product is rounded to the
 * nearest integer. Turning the result into a disk-footprint estimate
 * (chunks × bytes-per-chunk in Qdrant) or a time estimate
 * (chunks ÷ chunks-per-minute-on-this-hardware) is left to the caller.
 *
 * @param filename File name used to select the ratio.
 * @param fileSizeBytes Size of the file on disk, in bytes.
 * @param rows Registry rows to match against.
 * @returns The rounded chunk estimate, or `null` when no row matches.
 */
export function estimateChunkCount(
  filename: string,
  fileSizeBytes: number,
  rows: RatioRow[]
): number | null {
  const BYTES_PER_MB = 1024 * 1024

  const chunksPerMb = findChunksPerMb(filename, rows)
  if (chunksPerMb !== null) {
    const sizeInMb = fileSizeBytes / BYTES_PER_MB
    return Math.round(chunksPerMb * sizeInMb)
  }
  return null
}
|
||||
|
|
@ -0,0 +1,64 @@
|
|||
import { BaseSchema } from '@adonisjs/lucid/schema'
|
||||
import { DateTime } from 'luxon'
|
||||
|
||||
/**
 * Heuristic starting ratios inserted by this migration. Written as a compact
 * `[pattern, chunks_per_mb, notes]` table and expanded into row objects below;
 * row order is the insert order.
 */
const SEED_ROWS: Array<{ pattern: string; chunks_per_mb: number; notes: string }> = (() => {
  const table: ReadonlyArray<readonly [pattern: string, chunksPerMb: number, notes: string]> = [
    // Dense technical reference — every paragraph carries content.
    ['devdocs_', 1100, 'Heuristic seed: dense API references'],

    // Encyclopedia prose — Simple English and general Wikipedia variants.
    ['wikipedia_en_simple_', 270, 'Heuristic seed: Simple English Wikipedia'],
    ['wikipedia_en_', 270, 'Heuristic seed: general Wikipedia variants'],

    // Sparse text, image-heavy.
    ['ifixit_', 50, 'Heuristic seed: image-heavy repair guides'],

    // Q&A pages — moderate density, mostly short answers.
    ['cooking.stackexchange.com_', 200, 'Heuristic seed: Stack Exchange Q&A'],

    // Video-only ZIMs produce zero text chunks. Listing them explicitly keeps
    // the cost estimator from spinning up "indexing in progress" UI for
    // content that has no embeddable text whatsoever.
    ['lrnselfreliance_', 0, 'Heuristic seed: video-only ZIM'],
    ['ted_', 0, 'Heuristic seed: video-only ZIM'],
    ['freedom-of-religion_', 0, 'Heuristic seed: video-only ZIM'],

    // Empty-pattern fallback — startsWith('') is true for every filename. The
    // lookup prefers the longest matching pattern, so this row only applies
    // to ZIMs that match none of the above (medium prose density).
    ['', 100, 'Heuristic fallback'],
  ]

  return table.map(([pattern, chunksPerMb, notes]) => ({
    pattern,
    chunks_per_mb: chunksPerMb,
    notes,
  }))
})()
|
||||
|
||||
/**
 * Creates the `kb_ratio_registry` table and seeds it with `SEED_ROWS`.
 * Rolling back drops the table (and with it the seed rows).
 */
export default class extends BaseSchema {
  protected tableName = 'kb_ratio_registry'

  /** Create the table, then queue the seed insert via `defer`. */
  async up() {
    this.schema.createTable(this.tableName, (t) => {
      t.increments('id').primary()
      t.string('pattern', 255).notNullable().unique()
      t.decimal('chunks_per_mb', 10, 2).notNullable()
      // sample_count: 0 marks a heuristic seed; a positive value is the number
      // of observed ZIMs that have updated the entry. Phase 4 self-calibration
      // increments it on each successful ingestion.
      t.integer('sample_count').notNullable().defaultTo(0)
      t.text('notes').nullable()
      t.timestamp('created_at').notNullable()
      t.timestamp('updated_at').notNullable()
    })

    // One shared UTC timestamp (no offset suffix) for every seeded row.
    const stamp = DateTime.utc().toSQL({ includeOffset: false }) as string
    const seeded = SEED_ROWS.map((seed) =>
      Object.assign({}, seed, { created_at: stamp, updated_at: stamp })
    )

    this.defer(async (client) => {
      await client.table(this.tableName).multiInsert(seeded)
    })
  }

  /** Drop the table created by `up()`. */
  async down() {
    this.schema.dropTable(this.tableName)
  }
}
|
||||
62
admin/tests/unit/kb_ratio_lookup.spec.ts
Normal file
62
admin/tests/unit/kb_ratio_lookup.spec.ts
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
import * as assert from 'node:assert/strict'
|
||||
import { test } from 'node:test'
|
||||
|
||||
import { estimateChunkCount, findChunksPerMb } from '../../app/utils/kb_ratio_lookup.js'
|
||||
|
||||
// Fixture modelled on the registry's prefix rows. `wikipedia_en_` is set to
// 250 while `wikipedia_en_simple_` is 270 so the longest-prefix cases can tell
// which of the two rows was actually selected.
const FIXTURE_ROWS = [
  { pattern: 'devdocs_', chunks_per_mb: 1100 },
  { pattern: 'wikipedia_en_simple_', chunks_per_mb: 270 },
  { pattern: 'wikipedia_en_', chunks_per_mb: 250 },
  { pattern: 'ifixit_', chunks_per_mb: 50 },
  { pattern: 'lrnselfreliance_', chunks_per_mb: 0 },
  { pattern: '', chunks_per_mb: 100 },
]

// Same fixture minus the empty-string catch-all, for the "no match" cases.
const ROWS_WITHOUT_FALLBACK = FIXTURE_ROWS.filter((row) => row.pattern !== '')

const MIB = 1024 * 1024

interface LookupCase {
  title: string
  filename: string
  rows: typeof FIXTURE_ROWS
  expected: number | null
}

const lookupCases: LookupCase[] = [
  {
    title: 'exact prefix match',
    filename: 'devdocs_en_python_2026-02.zim',
    rows: FIXTURE_ROWS,
    expected: 1100,
  },
  {
    // wikipedia_en_simple_* should pick 270, not the 250 from wikipedia_en_
    title: 'longest-prefix wins over broader sibling',
    filename: 'wikipedia_en_simple_all_nopic_2026-02.zim',
    rows: FIXTURE_ROWS,
    expected: 270,
  },
  {
    // wikipedia_en_medicine_* has no row of its own; falls through to wikipedia_en_ at 250
    title: 'broader prefix used when no specific match',
    filename: 'wikipedia_en_medicine_nopic_2026-04.zim',
    rows: FIXTURE_ROWS,
    expected: 250,
  },
  {
    title: 'empty-string fallback catches unmatched filenames',
    filename: 'something_unknown_2026-02.zim',
    rows: FIXTURE_ROWS,
    expected: 100,
  },
  {
    title: 'returns null when no row matches and no fallback is registered',
    filename: 'something_unknown_2026-02.zim',
    rows: ROWS_WITHOUT_FALLBACK,
    expected: null,
  },
  {
    title: 'zero-ratio entry returns 0, not null (video-only ZIMs)',
    filename: 'lrnselfreliance_en_all_2025-12.zim',
    rows: FIXTURE_ROWS,
    expected: 0,
  },
]

for (const { title, filename, rows, expected } of lookupCases) {
  test(title, () => {
    assert.equal(findChunksPerMb(filename, rows), expected)
  })
}

interface EstimateCase extends LookupCase {
  bytes: number
}

const estimateCases: EstimateCase[] = [
  {
    // 100 MB * 1100 chunks/MB ≈ 110,000 chunks for devdocs
    title: 'estimateChunkCount scales by file size in MB',
    filename: 'devdocs_en_python_2026-02.zim',
    bytes: 100 * MIB,
    rows: FIXTURE_ROWS,
    expected: 110000,
  },
  {
    title: 'estimateChunkCount returns 0 for video-only ZIM regardless of size',
    filename: 'lrnselfreliance_en_all_2025-12.zim',
    bytes: 5 * 1024 * MIB, // 5 GB
    rows: FIXTURE_ROWS,
    expected: 0,
  },
  {
    title: 'estimateChunkCount returns null when no match and no fallback',
    filename: 'something_unknown_2026-02.zim',
    bytes: 50 * MIB,
    rows: ROWS_WITHOUT_FALLBACK,
    expected: null,
  },
]

for (const { title, filename, bytes, rows, expected } of estimateCases) {
  test(title, () => {
    assert.equal(estimateChunkCount(filename, bytes, rows), expected)
  })
}
|
||||
Loading…
Reference in New Issue
Block a user