feat(KB): ratio registry for disk + time estimates (Phase 1B of RFC #883)

Foundation for the cost estimates and partial-stall detection that
Phase 2 will surface. No consumers yet — this PR just lays the table,
the seed rows, and the lookup helper so subsequent UI work has
estimates available without a per-ZIM benchmark.

## What lands

- New table `kb_ratio_registry` (pattern, chunks_per_mb, sample_count,
  notes). Migration creates and seeds heuristic defaults from the RFC
  appendix: devdocs (1100/MB), Wikipedia variants (270/MB), iFixit
  (50/MB), Stack Exchange Q&A (200/MB), video-only ZIMs (0), plus a
  catch-all fallback at 100/MB.
- `KbRatioRegistry` model with static `lookup()` and `estimateChunks()`.
- Pure helper `kb_ratio_lookup.ts` doing longest-prefix-match — a
  specific entry (`wikipedia_en_simple_`) overrides a broader one
  (`wikipedia_en_`). 9 unit tests covering the lookup boundary.
- `sample_count` starts at 0 (heuristic seed) and is reserved for
  Phase 4 self-calibration to increment as observed ZIMs update each row.

## Not in scope

- Self-calibration on successful ingestion (Phase 4)
- UI consumers — Warning B (partial-embed stall) and the storage budget
  meter / time estimates land in Phase 2.

## Tested

- Type-check clean
- 9 unit tests pass for `findChunksPerMb` and `estimateChunkCount`
- Migration applied on NOMAD3 via hot-patch; 9 seed rows verified in DB
This commit is contained in:
Chris Sherwood 2026-05-14 12:15:46 -07:00 committed by Jake Turner
parent 8ce5790ab5
commit 68e1bd5ff2
4 changed files with 221 additions and 0 deletions

View File

@ -0,0 +1,51 @@
import { DateTime } from 'luxon'
import { BaseModel, column, SnakeCaseNamingStrategy } from '@adonisjs/lucid/orm'
import { findChunksPerMb, estimateChunkCount } from '../utils/kb_ratio_lookup.js'
/**
* Self-calibrating registry of `{filename-prefix → chunks_per_mb}` ratios used
* for disk-footprint and time-to-embed estimates surfaced in the KB panel.
*
* Migration seeds the registry with heuristic defaults from the RFC #883
* appendix; Phase 4 self-calibration will update rows in place as ZIMs finish
* ingesting and the real ratio becomes known. Lookup is longest-prefix-match
* (see `kb_ratio_lookup.ts`) so a specific entry (`wikipedia_en_simple_`)
* overrides a broader one (`wikipedia_en_`).
*/
export default class KbRatioRegistry extends BaseModel {
static table = 'kb_ratio_registry'
static namingStrategy = new SnakeCaseNamingStrategy()
@column({ isPrimary: true })
declare id: number
@column()
declare pattern: string
@column()
declare chunks_per_mb: number
@column()
declare sample_count: number
@column()
declare notes: string | null
@column.dateTime({ autoCreate: true })
declare created_at: DateTime
@column.dateTime({ autoCreate: true, autoUpdate: true })
declare updated_at: DateTime
/** Look up chunks_per_mb for a filename by longest-prefix match. */
static async lookup(filename: string): Promise<number | null> {
const rows = await this.all()
return findChunksPerMb(filename, rows)
}
/** Estimate total chunks for a file of the given size on disk. */
static async estimateChunks(filename: string, fileSizeBytes: number): Promise<number | null> {
const rows = await this.all()
return estimateChunkCount(filename, fileSizeBytes, rows)
}
}

View File

@ -0,0 +1,44 @@
export interface RatioRow {
pattern: string
chunks_per_mb: number
}
/**
* Pick the chunks_per_mb estimate for a filename by longest-prefix match.
*
* Patterns are filename prefixes (`devdocs_`, `wikipedia_en_simple_`, ...).
* The longest matching prefix wins, so a specific entry (`wikipedia_en_simple_`)
* overrides the broader fallback (`wikipedia_en_`). An empty-string pattern in
* the registry serves as a catch-all that matches every input.
*
* Returns `null` if no row matches and no empty-string fallback is present
* caller decides whether to surface "unknown" or use its own default.
*/
export function findChunksPerMb(filename: string, rows: RatioRow[]): number | null {
let best: RatioRow | null = null
for (const row of rows) {
if (!filename.startsWith(row.pattern)) continue
if (best === null || row.pattern.length > best.pattern.length) {
best = row
}
}
return best === null ? null : best.chunks_per_mb
}
/**
* Estimate the number of embedding chunks a ZIM-style file will produce given
* its size on disk in bytes. Returns `null` when the registry has nothing to
* match against. Caller is responsible for converting the estimate into either
* a disk-footprint estimate (chunks × bytes-per-chunk in Qdrant) or a time
* estimate (chunks ÷ chunks-per-minute-on-this-hardware).
*/
export function estimateChunkCount(
filename: string,
fileSizeBytes: number,
rows: RatioRow[]
): number | null {
const ratio = findChunksPerMb(filename, rows)
if (ratio === null) return null
const megabytes = fileSizeBytes / (1024 * 1024)
return Math.round(ratio * megabytes)
}

View File

@ -0,0 +1,64 @@
import { BaseSchema } from '@adonisjs/lucid/schema'
import { DateTime } from 'luxon'
const SEED_ROWS: Array<{ pattern: string; chunks_per_mb: number; notes: string }> = [
// Dense technical reference — every paragraph carries content
{ pattern: 'devdocs_', chunks_per_mb: 1100, notes: 'Heuristic seed: dense API references' },
// Encyclopedia prose — Simple English & general Wikipedia variants
{
pattern: 'wikipedia_en_simple_',
chunks_per_mb: 270,
notes: 'Heuristic seed: Simple English Wikipedia',
},
{
pattern: 'wikipedia_en_',
chunks_per_mb: 270,
notes: 'Heuristic seed: general Wikipedia variants',
},
// Sparse text, image-heavy
{ pattern: 'ifixit_', chunks_per_mb: 50, notes: 'Heuristic seed: image-heavy repair guides' },
// Q&A pages — moderate density, mostly short answers
{
pattern: 'cooking.stackexchange.com_',
chunks_per_mb: 200,
notes: 'Heuristic seed: Stack Exchange Q&A',
},
// Video-only ZIMs produce zero text chunks. Listing these explicitly keeps
// the cost estimator from spinning up "indexing in progress" UI for content
// that has no embeddable text whatsoever.
{ pattern: 'lrnselfreliance_', chunks_per_mb: 0, notes: 'Heuristic seed: video-only ZIM' },
{ pattern: 'ted_', chunks_per_mb: 0, notes: 'Heuristic seed: video-only ZIM' },
{ pattern: 'freedom-of-religion_', chunks_per_mb: 0, notes: 'Heuristic seed: video-only ZIM' },
// Empty-pattern fallback — every filename startsWith('') is true. The lookup
// picks the longest matching pattern, so this only fires for ZIMs that match
// none of the above (medium prose density).
{ pattern: '', chunks_per_mb: 100, notes: 'Heuristic fallback' },
]
export default class extends BaseSchema {
protected tableName = 'kb_ratio_registry'
async up() {
this.schema.createTable(this.tableName, (table) => {
table.increments('id').primary()
table.string('pattern', 255).notNullable().unique()
table.decimal('chunks_per_mb', 10, 2).notNullable()
// 0 = heuristic seed, >0 = number of observed ZIMs that have updated this entry.
// Phase 4 self-calibration increments this on each successful ingestion.
table.integer('sample_count').notNullable().defaultTo(0)
table.text('notes').nullable()
table.timestamp('created_at').notNullable()
table.timestamp('updated_at').notNullable()
})
const now = DateTime.utc().toSQL({ includeOffset: false }) as string
const rows = SEED_ROWS.map((row) => ({ ...row, created_at: now, updated_at: now }))
this.defer(async (db) => {
await db.table(this.tableName).multiInsert(rows)
})
}
async down() {
this.schema.dropTable(this.tableName)
}
}

View File

@ -0,0 +1,62 @@
import * as assert from 'node:assert/strict'
import { test } from 'node:test'
import { estimateChunkCount, findChunksPerMb } from '../../app/utils/kb_ratio_lookup.js'
const SEEDED_ROWS = [
{ pattern: 'devdocs_', chunks_per_mb: 1100 },
{ pattern: 'wikipedia_en_simple_', chunks_per_mb: 270 },
{ pattern: 'wikipedia_en_', chunks_per_mb: 250 },
{ pattern: 'ifixit_', chunks_per_mb: 50 },
{ pattern: 'lrnselfreliance_', chunks_per_mb: 0 },
{ pattern: '', chunks_per_mb: 100 },
]
test('exact prefix match', () => {
assert.equal(findChunksPerMb('devdocs_en_python_2026-02.zim', SEEDED_ROWS), 1100)
})
test('longest-prefix wins over broader sibling', () => {
// wikipedia_en_simple_* should pick 270, not the 250 from wikipedia_en_
assert.equal(
findChunksPerMb('wikipedia_en_simple_all_nopic_2026-02.zim', SEEDED_ROWS),
270
)
})
test('broader prefix used when no specific match', () => {
// wikipedia_en_medicine_* is not seeded; falls through to wikipedia_en_ at 250
assert.equal(findChunksPerMb('wikipedia_en_medicine_nopic_2026-04.zim', SEEDED_ROWS), 250)
})
test('empty-string fallback catches unmatched filenames', () => {
assert.equal(findChunksPerMb('something_unknown_2026-02.zim', SEEDED_ROWS), 100)
})
test('returns null when no row matches and no fallback is registered', () => {
const rowsWithoutFallback = SEEDED_ROWS.filter((r) => r.pattern !== '')
assert.equal(findChunksPerMb('something_unknown_2026-02.zim', rowsWithoutFallback), null)
})
test('zero-ratio entry returns 0, not null (video-only ZIMs)', () => {
assert.equal(findChunksPerMb('lrnselfreliance_en_all_2025-12.zim', SEEDED_ROWS), 0)
})
test('estimateChunkCount scales by file size in MB', () => {
// 100 MB * 1100 chunks/MB ≈ 110,000 chunks for devdocs
const bytes = 100 * 1024 * 1024
assert.equal(estimateChunkCount('devdocs_en_python_2026-02.zim', bytes, SEEDED_ROWS), 110000)
})
test('estimateChunkCount returns 0 for video-only ZIM regardless of size', () => {
const bytes = 5 * 1024 * 1024 * 1024 // 5 GB
assert.equal(estimateChunkCount('lrnselfreliance_en_all_2025-12.zim', bytes, SEEDED_ROWS), 0)
})
test('estimateChunkCount returns null when no match and no fallback', () => {
const rowsWithoutFallback = SEEDED_ROWS.filter((r) => r.pattern !== '')
assert.equal(
estimateChunkCount('something_unknown_2026-02.zim', 50 * 1024 * 1024, rowsWithoutFallback),
null
)
})