nehraa · nehraa · May 25, 2026 · May 18, 2026 · May 25, 2026 · May 25, 2026
diff --git a/.env.example b/.env.example
@@ -12,8 +12,8 @@ CODERAG_GEMINI_API_KEY=your_api_key_here
 # Compatibility alias also accepted: CODERAG_GEMINI_AI_KEY
 
 # Optional: Override the default Gemini embedding model
-# Default: models/gemini-embedding-001
-CODERAG_GEMINI_MODEL=models/gemini-embedding-001
+# Default: models/gemini-embedding-2
+CODERAG_GEMINI_MODEL=models/gemini-embedding-2
 
 # ============================================
 # EMBEDDING CONFIGURATION

diff --git a/README.md b/README.md
@@ -101,7 +101,7 @@ Supported environment overrides:
 - `CODERAG_CUSTOM_HTTP_FORMAT`
 - `CODERAG_LLM_HEADERS`
 
-When `embedding.provider` is `gemini`, CodeRag defaults to `models/gemini-embedding-001` and requests 768-dimensional vectors explicitly so the stored embedding fingerprint matches the vectors written to LanceDB. It accepts either `CODERAG_GEMINI_API_KEY` or the compatibility alias `CODERAG_GEMINI_AI_KEY`.
+When `embedding.provider` is `gemini`, CodeRag defaults to `models/gemini-embedding-2` and requests 768-dimensional vectors explicitly so the stored embedding fingerprint matches the vectors written to LanceDB. It accepts either `CODERAG_GEMINI_API_KEY` or the compatibility alias `CODERAG_GEMINI_AI_KEY`.
 
 When `embedding.provider` is `onnx`, CodeRag uses `Xenova/gte-small` (384-dim, ~33MB) running locally via `@xenova/transformers`. No API key or external server needed. The model must be downloaded to `<onnxModelDir>/Xenova/gte-small/` (default `.coderag-models/models/Xenova/gte-small/`).
 

diff --git a/src/cli/setup-wizard.ts b/src/cli/setup-wizard.ts
@@ -68,7 +68,7 @@ export const runSetupWizard = async (cwd: string, logger?: Logger): Promise<void
   };
   const embeddingProviderKind = providerMap[embeddingProvider] ?? "local-hash";
 
-  let geminiModel = "models/gemini-embedding-001";
+  let geminiModel = "models/gemini-embedding-2";
   let geminiApiKey = "";
   let onnxModelDir = ".coderag-models/models";
 

diff --git a/src/indexer/documents.ts b/src/indexer/documents.ts
@@ -20,7 +20,8 @@ const readExternalNodeDoc = async (nodeId: string, docsPath: string): Promise<st
 };
 
 const EMPTY_LIST = "- None";
-const MAX_EMBEDDING_CHARS = 2048; // Embedding models cap at ~512 tokens; extra chars waste memory
+/** Heuristic: ~4 chars per token for mixed code/text content; an approximation, not a hard bound. */
+const CHARS_PER_TOKEN = 4;
 
 const formatList = (items: string[]): string => (items.length > 0 ? items.map((item) => `- ${item}`).join("\n") : EMPTY_LIST);
 
@@ -259,9 +260,10 @@ export const buildIndexedDocuments = async (
         embeddingText = [doc, sourceText].filter(Boolean).join("\n\n");
       }
 
-      // Truncate to save memory — embedding models cap at ~512 tokens anyway
-      if (embeddingText.length > MAX_EMBEDDING_CHARS) {
-        embeddingText = embeddingText.slice(0, MAX_EMBEDDING_CHARS);
+      // Truncate to the provider's approximate char budget (maxInputTokens * CHARS_PER_TOKEN)
+      const maxChars = embeddingProvider.maxInputTokens * CHARS_PER_TOKEN;
+      if (embeddingText.length > maxChars) {
+        embeddingText = embeddingText.slice(0, maxChars);
       }
 
       preparedForChunk.push({

diff --git a/src/indexer/embedder.ts b/src/indexer/embedder.ts
@@ -5,6 +5,8 @@ export class LocalHashEmbeddingProvider implements EmbeddingProvider {
   readonly name = "local-hash";
   readonly model = "local-hash";
   readonly dimensions: number;
+  /** Unlimited — hash-based embedding has no token limit. */
+  readonly maxInputTokens = Infinity;
 
   constructor(dimensions = 256) {
     this.dimensions = dimensions;

diff --git a/src/indexer/gemini-embedder.ts b/src/indexer/gemini-embedder.ts
@@ -2,7 +2,7 @@ import type { EmbeddingProvider } from "../types.js";
 import { ConfigurationError } from "../errors/index.js";
 
 const GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta";
-const DEFAULT_MODEL = "models/gemini-embedding-001";
+const DEFAULT_MODEL = "models/gemini-embedding-2";
 const DEFAULT_DIMENSIONS = 768;
 const MAX_BATCH_SIZE = 100;
 const GEMINI_API_KEY_ENV = "CODERAG_GEMINI_API_KEY";
@@ -23,6 +23,7 @@ export class GeminiEmbeddingProvider implements EmbeddingProvider {
   readonly name = "gemini";
   readonly dimensions = DEFAULT_DIMENSIONS;
   readonly maxBatchSize = MAX_BATCH_SIZE;
+  readonly maxInputTokens = 8192;
   readonly model: string;
   private readonly apiKey: string;
   private readonly timeoutMs: number;

diff --git a/src/indexer/onnx-embedder.ts b/src/indexer/onnx-embedder.ts
@@ -98,6 +98,7 @@ export class OnnxEmbeddingProvider implements EmbeddingProvider {
   readonly model = DEFAULT_MODEL;
   readonly dimensions = DEFAULT_DIMENSIONS;
   readonly maxBatchSize = 1; // One at a time to minimize memory pressure
+  readonly maxInputTokens = 256; // all-MiniLM-L6-v2 max sequence length; NOTE(review): README names Xenova/gte-small as the ONNX model — confirm
   private readonly modelDir: string;
   private readonly logger?: Logger;
 

diff --git a/src/llm/context-builder.ts b/src/llm/context-builder.ts
@@ -1,6 +1,13 @@
 import type { BlueprintNode } from "@abhinav2203/codeflow-core/schema";
 
-import type { ContextPackage, GraphSnapshot, IndexedNodeDocument, RetrievedNodeContext, RetrievalConfig } from "../types.js";
+import type {
+  ContextPackage,
+  GraphSnapshot,
+  IndexedNodeDocument,
+  RetrievedNodeContext,
+  RetrievalConfig
+} from "../types.js";
+import type { SectionLimits } from "./prompt.js";
 import { FileCache } from "../store/file-cache.js";
 import { createRetrievedNodeContext } from "../retrieval/page-index.js";
 
@@ -25,6 +32,36 @@ const buildGraphSummary = (
   return parts.join(" ");
 };
 
+/**
+ * Computes per-section character caps (SectionLimits) from the retrieval config.
+ *
+ * Each default is a fixed share of `retrieval.maxContextChars`, expressed
+ * against a 16,000-char baseline, rounded to the nearest integer and never
+ * below 1. A limit set explicitly on `retrieval` (non-nullish) is used as-is.
+ *
+ * Shares at the 16,000-char baseline (42% in total; the rest is not allocated here):
+ *   primaryDoc  ->  1,200 (7.5%)
+ *   primaryFile ->  4,000 (25%)
+ *   relatedDoc  ->    320 (2%)
+ *   relatedFile ->  1,200 (7.5%)
+ */
+export const deriveSectionLimits = (retrieval: RetrievalConfig): SectionLimits => {
+  // Scale factor of the configured budget relative to the 16,000 baseline.
+  const ratio = retrieval.maxContextChars / 16000;
+
+  /** Scales a baseline cap by `ratio`, rounding and clamping to at least 1. */
+  const scaled = (baselineChars: number): number => {
+    return Math.max(1, Math.round(ratio * baselineChars));
+  };
+
+  return {
+    primaryDoc: retrieval.primaryDocLimit ?? scaled(1200),
+    primaryFile: retrieval.primaryFileLimit ?? scaled(4000),
+    relatedDoc: retrieval.relatedDocLimit ?? scaled(320),
+    relatedFile: retrieval.relatedFileLimit ?? scaled(1200)
+  };
+};
+
 const truncateContext = (context: RetrievedNodeContext, maxChars: number, warnings: string[]): RetrievedNodeContext => {
   if (context.fullFileContent.length <= maxChars) {
     return context;
@@ -115,6 +152,8 @@ const buildRelatedContextPromises = (
 
 /**
  * Builds the final context package passed to the LLM or returned directly to the caller.
+ *
+ * The caller receives `limits` so it can pass them through to `buildMessages()`.
  */
 export const buildContextPackage = async (
   question: string,
@@ -127,7 +166,7 @@ export const buildContextPackage = async (
   dependencies: BlueprintNode[],
   dependents: BlueprintNode[],
   answerMode: ContextPackage["answerMode"]
-): Promise<ContextPackage> => {
+): Promise<{ context: ContextPackage; limits: SectionLimits }> => {
   const primaryDocument = primaryNode ? documents[primaryNode.id] : undefined;
   const primaryContext = primaryDocument
     ? await createRetrievedNodeContext(repoPath, fileCache, snapshot, primaryDocument, "primary")
@@ -138,13 +177,18 @@ export const buildContextPackage = async (
   const primaryResult = fitPrimaryContext(primaryContext, retrieval.maxContextChars);
   const relatedResult = fitRelatedContexts(resolvedRelatedContexts, primaryResult.remainingBudget);
 
+  const limits = deriveSectionLimits(retrieval);
+
   return {
-    question,
-    answerMode,
-    retrievalMode: "single" as const,
-    primaryNode: primaryResult.primaryContext,
-    relatedNodes: relatedResult.relatedContexts,
-    graphSummary: buildGraphSummary(primaryNode, dependencies, dependents),
-    warnings: [...primaryResult.warnings, ...relatedResult.warnings]
+    context: {
+      question,
+      answerMode,
+      retrievalMode: "single" as const,
+      primaryNode: primaryResult.primaryContext,
+      relatedNodes: relatedResult.relatedContexts,
+      graphSummary: buildGraphSummary(primaryNode, dependencies, dependents),
+      warnings: [...primaryResult.warnings, ...relatedResult.warnings]
+    },
+    limits
   };
-};
+};
diff --git a/src/llm/multi-hop-context-builder.ts b/src/llm/multi-hop-context-builder.ts
@@ -5,9 +5,11 @@ import type {
   GraphSnapshot,
   IndexedNodeDocument,
   MultiHopRetrievalResult,
+  RetrievedNodeContext,
   RetrievalConfig
 } from "../types.js";
-import type { RetrievedNodeContext } from "../types.js";
+import type { SectionLimits } from "./prompt.js";
+import { deriveSectionLimits } from "./context-builder.js";
 import { FileCache } from "../store/file-cache.js";
 import { createRetrievedNodeContext } from "../retrieval/page-index.js";
 
@@ -77,6 +79,8 @@ const buildRelatedNodeContexts = async (
  * Unlike the single-node path, there is no single primary node.
  * The first retrieved node is promoted to "primary" for display purposes,
  * and all remaining nodes are listed as related.
+ *
+ * Returns both the context and the derived section limits for prompt building.
  */
 export const buildMultiHopContextPackage = async (
   question: string,
@@ -87,10 +91,9 @@ export const buildMultiHopContextPackage = async (
   documents: Record<string, IndexedNodeDocument>,
   retrieval: RetrievalConfig,
   fileCache: FileCache
-): Promise<ContextPackage> => {
+): Promise<{ context: ContextPackage; limits: SectionLimits }> => {
   const allNodes = retrievalResult.deduplicatedNodes;
 
-  // Build RetrievedNodeContext for all deduplicated nodes
   const allContexts = await buildRelatedNodeContexts(
     allNodes,
     repoPath,
@@ -99,14 +102,12 @@ export const buildMultiHopContextPackage = async (
     documents
   );
 
-  // Promote the first node to "primary" for display
   const firstCtx = allContexts[0];
   const primaryContext: RetrievedNodeContext | null = firstCtx
     ? Object.assign({}, firstCtx, { relationship: "primary" as const, subQuestionIndex: undefined })
     : null;
   const relatedContexts: RetrievedNodeContext[] = allContexts.length > 1 ? allContexts.slice(1) : [];
 
-  // Apply context budgeting
   const warnings: string[] = [];
   let remainingBudget = retrieval.maxContextChars;
 
@@ -125,6 +126,7 @@ export const buildMultiHopContextPackage = async (
   const fittedRelated: RetrievedNodeContext[] = [];
   for (const ctx of relatedContexts) {
     if (remainingBudget <= 0) {
+      warnings.push(`Dropped file content for ${ctx.filePath} because the context budget was exhausted.`);
       fittedRelated.push({ ...ctx, fullFileContent: "" });
       continue;
     }
@@ -149,15 +151,20 @@ export const buildMultiHopContextPackage = async (
     filesReferenced: meta.filesReferenced
   }));
 
+  const limits = deriveSectionLimits(retrieval);
+
   return {
-    question,
-    answerMode: "llm" as const,
-    retrievalMode: "multi-hop" as const,
-    primaryNode: fittedPrimary,
-    relatedNodes: fittedRelated,
-    graphSummary: buildMultiHopGraphSummary(subQuestions, retrievalResult, snapshot),
-    warnings,
-    subQuestions,
-    subQuestionResults
+    context: {
+      question,
+      answerMode: "llm" as const,
+      retrievalMode: "multi-hop" as const,
+      primaryNode: fittedPrimary,
+      relatedNodes: fittedRelated,
+      graphSummary: buildMultiHopGraphSummary(subQuestions, retrievalResult, snapshot),
+      warnings,
+      subQuestions,
+      subQuestionResults
+    },
+    limits
   };
-};
+};