feat(KB): ratio registry for disk + time estimates (Phase 1B of RFC #883)

Foundation for the cost estimates and partial-stall detection that Phase 2 will surface. No consumers yet — this PR just lays the table, the seed rows, and the lookup helper so subsequent UI work has estimates available without a per-ZIM benchmark.

## What lands

- New table `kb_ratio_registry` (pattern, chunks_per_mb, sample_count, notes). Migration creates and seeds heuristic defaults from the RFC appendix: devdocs (1100/MB), Wikipedia variants (270/MB), iFixit (50/MB), Stack Exchange Q&A (200/MB), video-only ZIMs (0), plus a catch-all fallback at 100/MB.
- `KbRatioRegistry` model with static `lookup()` and `estimateChunks()`.
- Pure helper `kb_ratio_lookup.ts` doing longest-prefix-match — a specific entry (`wikipedia_en_simple_`) overrides a broader one (`wikipedia_en_`). 9 unit tests covering the lookup boundary.
- `sample_count` starts at 0 (heuristic seed) and is reserved for Phase 4 self-calibration to increment as observed ZIMs update each row.

## Not in scope

- Self-calibration on successful ingestion (Phase 4)
- UI consumers — Warning B (partial-embed stall) and the storage budget meter / time estimates land in Phase 2.

## Tested

- Type-check clean
- 9 unit tests pass for `findChunksPerMb` and `estimateChunkCount`
- Migration applied on NOMAD3 via hot-patch; 9 seed rows verified in DB
2026-05-30 16:16:50 +02:00 · 2026-05-14 12:15:46 -07:00 · 2026-05-14 12:15:46 -07:00 · 68e1bd5ff2
commit 68e1bd5ff2
parent 8ce5790ab5
4 changed files with 221 additions and 0 deletions
--- a/admin/app/models/kb_ratio_registry.ts
+++ b/admin/app/models/kb_ratio_registry.ts
@ -0,0 +1,51 @@
+import { DateTime } from 'luxon'
+import { BaseModel, column, SnakeCaseNamingStrategy } from '@adonisjs/lucid/orm'
+import { findChunksPerMb, estimateChunkCount } from '../utils/kb_ratio_lookup.js'
+
+/**
+ * Lucid model for the `kb_ratio_registry` table: a registry of
+ * `{filename-prefix → chunks_per_mb}` ratios used for disk-footprint and
+ * time-to-embed estimates surfaced in the KB panel.
+ *
+ * Migration seeds the registry with heuristic defaults from the RFC #883
+ * appendix; Phase 4 self-calibration will update rows in place as ZIMs finish
+ * ingesting and the real ratio becomes known. Lookup is longest-prefix-match
+ * (see `kb_ratio_lookup.ts`) so a specific entry (`wikipedia_en_simple_`)
+ * overrides a broader one (`wikipedia_en_`).
+ */
+export default class KbRatioRegistry extends BaseModel {
+  static table = 'kb_ratio_registry'
+  static namingStrategy = new SnakeCaseNamingStrategy()
+
+  @column({ isPrimary: true })
+  declare id: number
+
+  /** Filename prefix this row applies to; the empty string acts as the catch-all. */
+  @column()
+  declare pattern: string
+
+  /**
+   * Estimated embedding chunks produced per megabyte of file on disk.
+   *
+   * The column is created as `decimal(10, 2)`. Several SQL drivers (notably
+   * the PostgreSQL and MySQL ones) hand DECIMAL/NUMERIC values back as strings
+   * by default, which would make this property a string such as `"270.00"` at
+   * runtime despite the `number` declaration. The `consume` hook normalises
+   * the value so `lookup()` really returns a number and comparisons like
+   * `=== 0` behave. It is a no-op when the driver already returns a number.
+   * NOTE(review): which driver this project runs on is not visible from here.
+   */
+  @column({
+    consume: (value: unknown) => (value === null || value === undefined ? value : Number(value)),
+  })
+  declare chunks_per_mb: number
+
+  /** 0 for a heuristic seed row; reserved for Phase 4 self-calibration to increment. */
+  @column()
+  declare sample_count: number
+
+  @column()
+  declare notes: string | null
+
+  @column.dateTime({ autoCreate: true })
+  declare created_at: DateTime
+
+  @column.dateTime({ autoCreate: true, autoUpdate: true })
+  declare updated_at: DateTime
+
+  /**
+   * Look up chunks_per_mb for a filename by longest-prefix match.
+   *
+   * Loads every registry row and delegates to `findChunksPerMb`.
+   *
+   * @param filename File name to match against the registered prefixes.
+   * @returns The matching ratio, or `null` when no row (not even an
+   *   empty-pattern fallback) matches.
+   */
+  static async lookup(filename: string): Promise<number | null> {
+    const rows = await this.all()
+    return findChunksPerMb(filename, rows)
+  }
+
+  /**
+   * Estimate total chunks for a file of the given size on disk.
+   *
+   * Loads every registry row and delegates to `estimateChunkCount`.
+   *
+   * @param filename File name to match against the registered prefixes.
+   * @param fileSizeBytes Size of the file on disk, in bytes.
+   * @returns The rounded chunk estimate, or `null` when no row matches.
+   */
+  static async estimateChunks(filename: string, fileSizeBytes: number): Promise<number | null> {
+    const rows = await this.all()
+    return estimateChunkCount(filename, fileSizeBytes, rows)
+  }
+}
--- a/admin/app/utils/kb_ratio_lookup.ts
+++ b/admin/app/utils/kb_ratio_lookup.ts
@ -0,0 +1,44 @@
+/** Minimal shape of a registry row needed by the lookup helpers. */
+export interface RatioRow {
+  /** Estimated embedding chunks per megabyte of file on disk. */
+  chunks_per_mb: number
+  /** Filename prefix the ratio applies to (`''` matches every filename). */
+  pattern: string
+}
+
+/**
+ * Resolve the chunks_per_mb ratio for `filename` using longest-prefix match.
+ *
+ * Each row's `pattern` is treated as a filename prefix (`devdocs_`,
+ * `wikipedia_en_simple_`, ...). Among all rows whose pattern prefixes the
+ * filename, the one with the longest pattern is chosen, which lets a specific
+ * entry (`wikipedia_en_simple_`) take precedence over a broader one
+ * (`wikipedia_en_`). A row with an empty-string pattern prefixes every
+ * filename and therefore works as a catch-all. When two matching rows have
+ * patterns of equal length, the earlier row in `rows` is kept.
+ *
+ * @param filename File name to classify.
+ * @param rows Registry rows to search.
+ * @returns The winning row's `chunks_per_mb`, or `null` when nothing matches
+ *   (no empty-string fallback present) — the caller decides whether to
+ *   surface "unknown" or fall back to its own default.
+ */
+export function findChunksPerMb(filename: string, rows: RatioRow[]): number | null {
+  const winner = rows.reduce<RatioRow | null>((leader, candidate) => {
+    const isPrefix = filename.startsWith(candidate.pattern)
+    // Strictly longer only: a tie keeps the row seen first.
+    const beatsLeader = leader === null || candidate.pattern.length > leader.pattern.length
+    return isPrefix && beatsLeader ? candidate : leader
+  }, null)
+
+  if (winner === null) {
+    return null
+  }
+  return winner.chunks_per_mb
+}
+
+/**
+ * Estimate how many embedding chunks a ZIM-style file will yield from its
+ * on-disk size.
+ *
+ * The ratio is resolved with `findChunksPerMb`, the byte size is converted to
+ * megabytes (1 MB = 1024 × 1024 bytes), and the product is rounded to the
+ * nearest integer. Turning the result into a disk-footprint estimate
+ * (chunks × bytes-per-chunk in Qdrant) or a time estimate
+ * (chunks ÷ chunks-per-minute-on-this-hardware) is left to the caller.
+ *
+ * @param filename File name used to pick the ratio.
+ * @param fileSizeBytes Size of the file on disk, in bytes.
+ * @param rows Registry rows to search.
+ * @returns The rounded chunk estimate, or `null` when no row matches.
+ */
+export function estimateChunkCount(
+  filename: string,
+  fileSizeBytes: number,
+  rows: RatioRow[]
+): number | null {
+  const bytesPerMb = 1024 * 1024
+  const chunksPerMb = findChunksPerMb(filename, rows)
+
+  if (chunksPerMb !== null) {
+    const sizeInMb = fileSizeBytes / bytesPerMb
+    return Math.round(chunksPerMb * sizeInMb)
+  }
+  return null
+}
--- a/admin/database/migrations/1776100000001_create_kb_ratio_registry_table.ts
+++ b/admin/database/migrations/1776100000001_create_kb_ratio_registry_table.ts
@ -0,0 +1,64 @@
+import { BaseSchema } from '@adonisjs/lucid/schema'
+import { DateTime } from 'luxon'
+
+/**
+ * Heuristic default rows inserted when the table is created. Patterns are
+ * filename prefixes; the lookup helper picks the longest matching one.
+ */
+const SEED_ROWS: { pattern: string; chunks_per_mb: number; notes: string }[] = [
+  // Dense technical reference: every paragraph carries content.
+  {
+    pattern: 'devdocs_',
+    chunks_per_mb: 1100,
+    notes: 'Heuristic seed: dense API references',
+  },
+
+  // Encyclopedia prose: Simple English first, then the general Wikipedia prefix.
+  {
+    pattern: 'wikipedia_en_simple_',
+    chunks_per_mb: 270,
+    notes: 'Heuristic seed: Simple English Wikipedia',
+  },
+  {
+    pattern: 'wikipedia_en_',
+    chunks_per_mb: 270,
+    notes: 'Heuristic seed: general Wikipedia variants',
+  },
+
+  // Image-heavy content with sparse text.
+  {
+    pattern: 'ifixit_',
+    chunks_per_mb: 50,
+    notes: 'Heuristic seed: image-heavy repair guides',
+  },
+
+  // Q&A pages: moderate density, mostly short answers.
+  {
+    pattern: 'cooking.stackexchange.com_',
+    chunks_per_mb: 200,
+    notes: 'Heuristic seed: Stack Exchange Q&A',
+  },
+
+  // Video-only ZIMs yield no text chunks at all. Seeding them with an explicit
+  // zero stops the cost estimator from showing "indexing in progress" UI for
+  // content that has nothing embeddable.
+  {
+    pattern: 'lrnselfreliance_',
+    chunks_per_mb: 0,
+    notes: 'Heuristic seed: video-only ZIM',
+  },
+  {
+    pattern: 'ted_',
+    chunks_per_mb: 0,
+    notes: 'Heuristic seed: video-only ZIM',
+  },
+  {
+    pattern: 'freedom-of-religion_',
+    chunks_per_mb: 0,
+    notes: 'Heuristic seed: video-only ZIM',
+  },
+
+  // Catch-all: `startsWith('')` holds for every filename, and because the
+  // longest matching pattern wins, this row applies only to ZIMs that match
+  // none of the prefixes above (medium prose density).
+  {
+    pattern: '',
+    chunks_per_mb: 100,
+    notes: 'Heuristic fallback',
+  },
+]
+
+/** Migration that creates `kb_ratio_registry` and inserts the heuristic seed rows. */
+export default class extends BaseSchema {
+  protected tableName = 'kb_ratio_registry'
+
+  /** Define the table, then queue a bulk insert of `SEED_ROWS` via `this.defer`. */
+  async up() {
+    this.schema.createTable(this.tableName, (t) => {
+      t.increments('id').primary()
+      t.string('pattern', 255).notNullable().unique()
+      t.decimal('chunks_per_mb', 10, 2).notNullable()
+      // sample_count semantics: 0 marks a heuristic seed; a positive value is the
+      // number of observed ZIMs that have updated the entry. Phase 4
+      // self-calibration increments it on each successful ingestion.
+      t.integer('sample_count').notNullable().defaultTo(0)
+      t.text('notes').nullable()
+      t.timestamp('created_at').notNullable()
+      t.timestamp('updated_at').notNullable()
+    })
+
+    // A single UTC timestamp (formatted without an offset) stamps every seed row.
+    const seededAt = DateTime.utc().toSQL({ includeOffset: false }) as string
+    const seedRecords = SEED_ROWS.map((seed) => {
+      return { ...seed, created_at: seededAt, updated_at: seededAt }
+    })
+
+    this.defer(async (db) => {
+      await db.table(this.tableName).multiInsert(seedRecords)
+    })
+  }
+
+  /** Drop the table (and with it the seed rows). */
+  async down() {
+    this.schema.dropTable(this.tableName)
+  }
+}
--- a/admin/tests/unit/kb_ratio_lookup.spec.ts
+++ b/admin/tests/unit/kb_ratio_lookup.spec.ts
@ -0,0 +1,62 @@
+import * as assert from 'node:assert/strict'
+import { test } from 'node:test'
+
+import { estimateChunkCount, findChunksPerMb } from '../../app/utils/kb_ratio_lookup.js'
+
+// Fixture rows for the lookup tests. `wikipedia_en_` deliberately uses 250
+// here (not the same value as `wikipedia_en_simple_`) so the tests can tell
+// which of the two prefixes actually won.
+const SEEDED_ROWS = [
+  { pattern: 'devdocs_', chunks_per_mb: 1100 },
+  { pattern: 'wikipedia_en_simple_', chunks_per_mb: 270 },
+  { pattern: 'wikipedia_en_', chunks_per_mb: 250 },
+  { pattern: 'ifixit_', chunks_per_mb: 50 },
+  { pattern: 'lrnselfreliance_', chunks_per_mb: 0 },
+  { pattern: '', chunks_per_mb: 100 },
+]
+
+test('exact prefix match', () => {
+  const ratio = findChunksPerMb('devdocs_en_python_2026-02.zim', SEEDED_ROWS)
+  assert.equal(ratio, 1100)
+})
+
+test('longest-prefix wins over broader sibling', () => {
+  // Both wikipedia_en_ (250) and wikipedia_en_simple_ (270) prefix this name;
+  // the longer, more specific one must be chosen.
+  const ratio = findChunksPerMb('wikipedia_en_simple_all_nopic_2026-02.zim', SEEDED_ROWS)
+  assert.equal(ratio, 270)
+})
+
+test('broader prefix used when no specific match', () => {
+  // No wikipedia_en_medicine_ row exists, so wikipedia_en_ (250) applies.
+  const ratio = findChunksPerMb('wikipedia_en_medicine_nopic_2026-04.zim', SEEDED_ROWS)
+  assert.equal(ratio, 250)
+})
+
+test('empty-string fallback catches unmatched filenames', () => {
+  const ratio = findChunksPerMb('something_unknown_2026-02.zim', SEEDED_ROWS)
+  assert.equal(ratio, 100)
+})
+
+test('returns null when no row matches and no fallback is registered', () => {
+  const rowsWithoutFallback = SEEDED_ROWS.filter(({ pattern }) => pattern !== '')
+  const ratio = findChunksPerMb('something_unknown_2026-02.zim', rowsWithoutFallback)
+  assert.equal(ratio, null)
+})
+
+test('zero-ratio entry returns 0, not null (video-only ZIMs)', () => {
+  const ratio = findChunksPerMb('lrnselfreliance_en_all_2025-12.zim', SEEDED_ROWS)
+  assert.equal(ratio, 0)
+})
+
+test('estimateChunkCount scales by file size in MB', () => {
+  // 100 MB at 1100 chunks/MB gives 110,000 chunks for devdocs.
+  const hundredMb = 100 * 1024 * 1024
+  const estimate = estimateChunkCount('devdocs_en_python_2026-02.zim', hundredMb, SEEDED_ROWS)
+  assert.equal(estimate, 110000)
+})
+
+test('estimateChunkCount returns 0 for video-only ZIM regardless of size', () => {
+  const fiveGb = 5 * 1024 * 1024 * 1024
+  const estimate = estimateChunkCount('lrnselfreliance_en_all_2025-12.zim', fiveGb, SEEDED_ROWS)
+  assert.equal(estimate, 0)
+})
+
+test('estimateChunkCount returns null when no match and no fallback', () => {
+  const rowsWithoutFallback = SEEDED_ROWS.filter(({ pattern }) => pattern !== '')
+  const fiftyMb = 50 * 1024 * 1024
+  const estimate = estimateChunkCount('something_unknown_2026-02.zim', fiftyMb, rowsWithoutFallback)
+  assert.equal(estimate, null)
+})