From 68e1bd5ff257e0c3b8ad0d3a15fefe93090ef475 Mon Sep 17 00:00:00 2001
From: Chris Sherwood
Date: Thu, 14 May 2026 12:15:46 -0700
Subject: [PATCH] feat(KB): ratio registry for disk + time estimates (Phase 1B
 of RFC #883)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Foundation for the cost estimates and partial-stall detection that Phase 2
will surface. No consumers yet — this PR just lays the table, the seed rows,
and the lookup helper so subsequent UI work has estimates available without
a per-ZIM benchmark.

## What lands

- New table `kb_ratio_registry` (pattern, chunks_per_mb, sample_count,
  notes). Migration creates and seeds heuristic defaults from the RFC
  appendix: devdocs (1100/MB), Wikipedia variants (270/MB), iFixit (50/MB),
  Stack Exchange Q&A (200/MB), video-only ZIMs (0), plus a catch-all
  fallback at 100/MB.
- `KbRatioRegistry` model with static `lookup()` and `estimateChunks()`.
- `chunks_per_mb` is coerced to a number when rows are read, because SQL
  drivers such as mysql2 and node-postgres return DECIMAL columns as strings.
- Pure helper `kb_ratio_lookup.ts` doing longest-prefix-match — a specific
  entry (`wikipedia_en_simple_`) overrides a broader one (`wikipedia_en_`).
  9 unit tests covering the lookup boundary.
- `sample_count` starts at 0 (heuristic seed) and is reserved for Phase 4
  self-calibration to increment as observed ZIMs update each row.

## Not in scope

- Self-calibration on successful ingestion (Phase 4)
- UI consumers — Warning B (partial-embed stall) and the storage budget
  meter / time estimates land in Phase 2.

## Tested

- Type-check clean
- 9 unit tests pass for `findChunksPerMb` and `estimateChunkCount`
- Migration applied on NOMAD3 via hot-patch; 9 seed rows verified in DB
---
 admin/app/models/kb_ratio_registry.ts         | 53 +++++++++++++++
 admin/app/utils/kb_ratio_lookup.ts            | 44 +++++++++++++
 ...00000001_create_kb_ratio_registry_table.ts | 64 +++++++++++++++++++
 admin/tests/unit/kb_ratio_lookup.spec.ts      | 62 ++++++++++++++++++
 4 files changed, 223 insertions(+)
 create mode 100644 admin/app/models/kb_ratio_registry.ts
 create mode 100644 admin/app/utils/kb_ratio_lookup.ts
 create mode 100644 admin/database/migrations/1776100000001_create_kb_ratio_registry_table.ts
 create mode 100644 admin/tests/unit/kb_ratio_lookup.spec.ts

diff --git a/admin/app/models/kb_ratio_registry.ts b/admin/app/models/kb_ratio_registry.ts
new file mode 100644
index 0000000..97cd1f2
--- /dev/null
+++ b/admin/app/models/kb_ratio_registry.ts
@@ -0,0 +1,53 @@
+import { DateTime } from 'luxon'
+import { BaseModel, column, SnakeCaseNamingStrategy } from '@adonisjs/lucid/orm'
+import { findChunksPerMb, estimateChunkCount } from '../utils/kb_ratio_lookup.js'
+
+/**
+ * Self-calibrating registry of `{filename-prefix → chunks_per_mb}` ratios used
+ * for disk-footprint and time-to-embed estimates surfaced in the KB panel.
+ *
+ * Migration seeds the registry with heuristic defaults from the RFC #883
+ * appendix; Phase 4 self-calibration will update rows in place as ZIMs finish
+ * ingesting and the real ratio becomes known. Lookup is longest-prefix-match
+ * (see `kb_ratio_lookup.ts`) so a specific entry (`wikipedia_en_simple_`)
+ * overrides a broader one (`wikipedia_en_`).
+ */
+export default class KbRatioRegistry extends BaseModel {
+  static table = 'kb_ratio_registry'
+  static namingStrategy = new SnakeCaseNamingStrategy()
+
+  @column({ isPrimary: true })
+  declare id: number
+
+  @column()
+  declare pattern: string
+
+  // DECIMAL columns are returned as strings by the mysql2 and pg drivers, so
+  // coerce on read to keep the declared `number` type true at runtime.
+  @column({ consume: (value: string | number) => Number(value) })
+  declare chunks_per_mb: number
+
+  @column()
+  declare sample_count: number
+
+  @column()
+  declare notes: string | null
+
+  @column.dateTime({ autoCreate: true })
+  declare created_at: DateTime
+
+  @column.dateTime({ autoCreate: true, autoUpdate: true })
+  declare updated_at: DateTime
+
+  /** Look up chunks_per_mb for a filename by longest-prefix match; `null` if no row matches. */
+  static async lookup(filename: string): Promise<number | null> {
+    const rows = await this.all()
+    return findChunksPerMb(filename, rows)
+  }
+
+  /** Estimate total chunks for a file of the given size on disk; `null` if no row matches. */
+  static async estimateChunks(filename: string, fileSizeBytes: number): Promise<number | null> {
+    const rows = await this.all()
+    return estimateChunkCount(filename, fileSizeBytes, rows)
+  }
+}
diff --git a/admin/app/utils/kb_ratio_lookup.ts b/admin/app/utils/kb_ratio_lookup.ts
new file mode 100644
index 0000000..19b22b6
--- /dev/null
+++ b/admin/app/utils/kb_ratio_lookup.ts
@@ -0,0 +1,44 @@
+export interface RatioRow {
+  pattern: string
+  chunks_per_mb: number
+}
+
+/**
+ * Pick the chunks_per_mb estimate for a filename by longest-prefix match.
+ *
+ * Patterns are filename prefixes (`devdocs_`, `wikipedia_en_simple_`, ...).
+ * The longest matching prefix wins, so a specific entry (`wikipedia_en_simple_`)
+ * overrides the broader fallback (`wikipedia_en_`). An empty-string pattern in
+ * the registry serves as a catch-all that matches every input.
+ *
+ * Returns `null` if no row matches and no empty-string fallback is present —
+ * caller decides whether to surface "unknown" or use its own default.
+ */
+export function findChunksPerMb(filename: string, rows: readonly RatioRow[]): number | null {
+  let best: RatioRow | null = null
+  for (const row of rows) {
+    if (!filename.startsWith(row.pattern)) continue
+    if (best === null || row.pattern.length > best.pattern.length) {
+      best = row
+    }
+  }
+  return best === null ? null : best.chunks_per_mb
+}
+
+/**
+ * Estimate the number of embedding chunks a ZIM-style file will produce given
+ * its size on disk in bytes. Returns `null` when the registry has nothing to
+ * match against. Caller is responsible for converting the estimate into either
+ * a disk-footprint estimate (chunks × bytes-per-chunk in Qdrant) or a time
+ * estimate (chunks ÷ chunks-per-minute-on-this-hardware).
+ */
+export function estimateChunkCount(
+  filename: string,
+  fileSizeBytes: number,
+  rows: readonly RatioRow[]
+): number | null {
+  const ratio = findChunksPerMb(filename, rows)
+  if (ratio === null) return null
+  const megabytes = fileSizeBytes / (1024 * 1024)
+  return Math.round(ratio * megabytes)
+}
diff --git a/admin/database/migrations/1776100000001_create_kb_ratio_registry_table.ts b/admin/database/migrations/1776100000001_create_kb_ratio_registry_table.ts
new file mode 100644
index 0000000..fb0e38c
--- /dev/null
+++ b/admin/database/migrations/1776100000001_create_kb_ratio_registry_table.ts
@@ -0,0 +1,64 @@
+import { BaseSchema } from '@adonisjs/lucid/schema'
+import { DateTime } from 'luxon'
+
+const SEED_ROWS: Array<{ pattern: string; chunks_per_mb: number; notes: string }> = [
+  // Dense technical reference — every paragraph carries content
+  { pattern: 'devdocs_', chunks_per_mb: 1100, notes: 'Heuristic seed: dense API references' },
+  // Encyclopedia prose — Simple English & general Wikipedia variants
+  {
+    pattern: 'wikipedia_en_simple_',
+    chunks_per_mb: 270,
+    notes: 'Heuristic seed: Simple English Wikipedia',
+  },
+  {
+    pattern: 'wikipedia_en_',
+    chunks_per_mb: 270,
+    notes: 'Heuristic seed: general Wikipedia variants',
+  },
+  // Sparse text, image-heavy
+  { pattern: 'ifixit_', chunks_per_mb: 50, notes: 'Heuristic seed: image-heavy repair guides' },
+  // Q&A pages — moderate density, mostly short answers
+  {
+    pattern: 'cooking.stackexchange.com_',
+    chunks_per_mb: 200,
+    notes: 'Heuristic seed: Stack Exchange Q&A',
+  },
+  // Video-only ZIMs produce zero text chunks. Listing these explicitly keeps
+  // the cost estimator from spinning up "indexing in progress" UI for content
+  // that has no embeddable text whatsoever.
+  { pattern: 'lrnselfreliance_', chunks_per_mb: 0, notes: 'Heuristic seed: video-only ZIM' },
+  { pattern: 'ted_', chunks_per_mb: 0, notes: 'Heuristic seed: video-only ZIM' },
+  { pattern: 'freedom-of-religion_', chunks_per_mb: 0, notes: 'Heuristic seed: video-only ZIM' },
+  // Empty-pattern fallback — every filename startsWith('') is true. The lookup
+  // picks the longest matching pattern, so this only fires for ZIMs that match
+  // none of the above (medium prose density).
+  { pattern: '', chunks_per_mb: 100, notes: 'Heuristic fallback' },
+]
+
+export default class extends BaseSchema {
+  protected tableName = 'kb_ratio_registry'
+
+  async up() {
+    this.schema.createTable(this.tableName, (table) => {
+      table.increments('id').primary()
+      table.string('pattern', 255).notNullable().unique()
+      table.decimal('chunks_per_mb', 10, 2).notNullable()
+      // 0 = heuristic seed, >0 = number of observed ZIMs that have updated this entry.
+      // Phase 4 self-calibration increments this on each successful ingestion.
+      table.integer('sample_count').notNullable().defaultTo(0)
+      table.text('notes').nullable()
+      table.timestamp('created_at').notNullable()
+      table.timestamp('updated_at').notNullable()
+    })
+
+    const now = DateTime.utc().toSQL({ includeOffset: false }) as string
+    const rows = SEED_ROWS.map((row) => ({ ...row, created_at: now, updated_at: now }))
+    this.defer(async (db) => {
+      await db.table(this.tableName).multiInsert(rows)
+    })
+  }
+
+  async down() {
+    this.schema.dropTable(this.tableName)
+  }
+}
diff --git a/admin/tests/unit/kb_ratio_lookup.spec.ts b/admin/tests/unit/kb_ratio_lookup.spec.ts
new file mode 100644
index 0000000..08c3350
--- /dev/null
+++ b/admin/tests/unit/kb_ratio_lookup.spec.ts
@@ -0,0 +1,62 @@
+import * as assert from 'node:assert/strict'
+import { test } from 'node:test'
+
+import { estimateChunkCount, findChunksPerMb } from '../../app/utils/kb_ratio_lookup.js'
+
+const SEEDED_ROWS = [
+  { pattern: 'devdocs_', chunks_per_mb: 1100 },
+  { pattern: 'wikipedia_en_simple_', chunks_per_mb: 270 },
+  { pattern: 'wikipedia_en_', chunks_per_mb: 250 },
+  { pattern: 'ifixit_', chunks_per_mb: 50 },
+  { pattern: 'lrnselfreliance_', chunks_per_mb: 0 },
+  { pattern: '', chunks_per_mb: 100 },
+]
+
+test('exact prefix match', () => {
+  assert.equal(findChunksPerMb('devdocs_en_python_2026-02.zim', SEEDED_ROWS), 1100)
+})
+
+test('longest-prefix wins over broader sibling', () => {
+  // wikipedia_en_simple_* should pick 270, not the 250 from wikipedia_en_
+  assert.equal(
+    findChunksPerMb('wikipedia_en_simple_all_nopic_2026-02.zim', SEEDED_ROWS),
+    270
+  )
+})
+
+test('broader prefix used when no specific match', () => {
+  // wikipedia_en_medicine_* is not seeded; falls through to wikipedia_en_ at 250
+  assert.equal(findChunksPerMb('wikipedia_en_medicine_nopic_2026-04.zim', SEEDED_ROWS), 250)
+})
+
+test('empty-string fallback catches unmatched filenames', () => {
+  assert.equal(findChunksPerMb('something_unknown_2026-02.zim', SEEDED_ROWS), 100)
+})
+
+test('returns null when no row matches and no fallback is registered', () => {
+  const rowsWithoutFallback = SEEDED_ROWS.filter((r) => r.pattern !== '')
+  assert.equal(findChunksPerMb('something_unknown_2026-02.zim', rowsWithoutFallback), null)
+})
+
+test('zero-ratio entry returns 0, not null (video-only ZIMs)', () => {
+  assert.equal(findChunksPerMb('lrnselfreliance_en_all_2025-12.zim', SEEDED_ROWS), 0)
+})
+
+test('estimateChunkCount scales by file size in MB', () => {
+  // 100 MB * 1100 chunks/MB ≈ 110,000 chunks for devdocs
+  const bytes = 100 * 1024 * 1024
+  assert.equal(estimateChunkCount('devdocs_en_python_2026-02.zim', bytes, SEEDED_ROWS), 110000)
+})
+
+test('estimateChunkCount returns 0 for video-only ZIM regardless of size', () => {
+  const bytes = 5 * 1024 * 1024 * 1024 // 5 GB
+  assert.equal(estimateChunkCount('lrnselfreliance_en_all_2025-12.zim', bytes, SEEDED_ROWS), 0)
+})
+
+test('estimateChunkCount returns null when no match and no fallback', () => {
+  const rowsWithoutFallback = SEEDED_ROWS.filter((r) => r.pattern !== '')
+  assert.equal(
+    estimateChunkCount('something_unknown_2026-02.zim', 50 * 1024 * 1024, rowsWithoutFallback),
+    null
+  )
+})