From 2dec5bf676be90a7b34940597df959b1ecb62bbd Mon Sep 17 00:00:00 2001
From: Chris Sherwood <chris@crosstalksolutions.com>
Date: Thu, 14 May 2026 12:11:43 -0700
Subject: [PATCH] fix(AI): pre-cap embed input + log fallback reason (#881)

The OpenAI-compatible /v1/embeddings fallback path can't pass
`truncate:true` / `num_ctx:8192` to the model, so any chunk that
exceeds the model's loaded context_length (often 2048 for
nomic-embed-text:v1.5) returns a 400 BadRequestError and is silently
dropped from Qdrant. Two CPU-only ingestion runs on NOMAD1 hit this
on dense technical content (medlineplus, arduino.stackexchange) even
after PR #763's num_ctx fix on the native path.

Pre-cap each input string at 4000 chars before either backend call.
That's ~1000-2000 tokens depending on density, comfortably under the
model's 2048 default. The chunker in RagService is sized for
MAX_SAFE_TOKENS=1600 (3200 chars at its conservative 2 chars/token
estimate), so well-formed inputs are never touched; this is purely a
runtime safety net for the edge cases that slip through.

Also stop swallowing the original error in the catch. The bare
`} catch {}` here has masked recurring "input length exceeds context
length" failures for months (#369, #670, #881). Capture and warn-log
the message so future investigations see why we fell back.

Same root cause as #369 and #670 which were closed without an actual
fix to the fallback path.
---
 admin/app/services/ollama_service.ts | 43 ++++++++++++++++++++++++----
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/admin/app/services/ollama_service.ts b/admin/app/services/ollama_service.ts
index 78fbf69..d5a4d7e 100644
--- a/admin/app/services/ollama_service.ts
+++ b/admin/app/services/ollama_service.ts
@@ -469,6 +469,18 @@ export class OllamaService {
     }
   }
 
+  /**
+   * Hard char cap per embed input, applied as a runtime safety net regardless of
+   * which backend path runs. The chunker in RagService caps at MAX_SAFE_TOKENS=1600
+   * (3200 chars at the conservative 2 chars/token estimate), but dense technical
+   * content has been observed to slip past on multi-batch ZIM ingestion (#881).
+   *
+   * 4000 chars ≈ 1000–2000 tokens depending on density, which keeps us comfortably
+   * under nomic-embed-text:v1.5's default 2048-token context even on the OpenAI-compat
+   * fallback path (which can't pass `truncate:true`/`num_ctx` to the model).
+   */
+  public static readonly EMBED_MAX_INPUT_CHARS = 4000
+
   /**
    * Generate embeddings for the given input strings.
    * Tries the Ollama native /api/embed endpoint first, falls back to /v1/embeddings.
@@ -479,6 +491,16 @@ export class OllamaService {
       throw new Error('AI service is not initialized.')
     }
 
+    // Runtime safety net (#881). The OpenAI-compat fallback has no equivalent of
+    // truncate:true, so a chunk that exceeds the model's loaded context_length
+    // (often 2048 for nomic-embed-text:v1.5) returns 400 and the chunk is silently
+    // dropped from Qdrant. Pre-capping at the input layer protects both paths.
+    const safeInput = input.map((s) =>
+      s.length > OllamaService.EMBED_MAX_INPUT_CHARS
+        ? s.slice(0, OllamaService.EMBED_MAX_INPUT_CHARS)
+        : s
+    )
+
     try {
       // Prefer Ollama native endpoint (supports batch input natively).
       // Pass num_ctx explicitly so we don't depend on the embedding model's
@@ -491,7 +513,7 @@ export class OllamaService {
         `${this.baseUrl}/api/embed`,
         {
           model,
-          input,
+          input: safeInput,
           truncate: true,
           options: { num_ctx: 8192 },
         },
@@ -503,12 +525,23 @@ export class OllamaService {
         throw new Error('Invalid /api/embed response — missing embeddings array')
       }
       return { embeddings: response.data.embeddings }
-    } catch {
-      // Fall back to OpenAI-compatible /v1/embeddings
+    } catch (err) {
+      // Capture the original error so we know *why* we fell back. Earlier bare
+      // catches here masked recurring "input length exceeds context length"
+      // failures for months (#369, #670, #881) — without this log we have no
+      // signal that /api/embed is the broken path vs the fallback.
+      logger.warn(
+        '[OllamaService] /api/embed failed, falling back to /v1/embeddings: %s',
+        err instanceof Error ? err.message : String(err)
+      )
+      // Fall back to OpenAI-compatible /v1/embeddings.
       // Explicitly request float format — some backends (e.g. LM Studio) don't reliably
       // implement the base64 encoding the OpenAI SDK requests by default.
-      logger.info('[OllamaService] /api/embed unavailable, falling back to /v1/embeddings')
-      const results = await this.openai.embeddings.create({ model, input, encoding_format: 'float' })
+      const results = await this.openai.embeddings.create({
+        model,
+        input: safeInput,
+        encoding_format: 'float',
+      })
       return { embeddings: results.data.map((e) => e.embedding as number[]) }
     }
   }