From dab7871cb4ed892eb28df4038884c2947e46a915 Mon Sep 17 00:00:00 2001
From: Rodin <rodin@forgedthought.ai>
Date: Fri, 1 May 2026 18:59:07 -0700
Subject: [PATCH] fix: address review findings on budget system
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Account for truncation marker tokens when computing diff budget
  (prevents EstTokens exceeding model limit in edge cases)
- Rune-safe truncation for both UserMeta and Diff (no split multi-byte)
- Fix misleading comment (1000 chars → ~1000 tokens/4000 chars)
- Extract marker strings as constants
- Add unit tests for BuildSystemBase and BuildUserMeta
---
 budget/budget.go      | 39 ++++++++++++++++++++++++++++++++++-----
 review/prompt_test.go | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/budget/budget.go b/budget/budget.go
index 449687e..cf66222 100644
--- a/budget/budget.go
+++ b/budget/budget.go
@@ -33,6 +33,10 @@ const defaultLimit = 128_000
 // reserveTokens is headroom for the response generation.
 const reserveTokens = 4_000
 
+const diffTruncMarker = "\n\n... [diff truncated due to context limit] ..."
+const diffTooLargeMarker = "... [diff too large for context window — review manually] ..."
+const userMetaTruncMarker = "\n... [description truncated] ..."
+
 // EstimateTokens estimates the number of tokens in a string.
 // Uses the rough heuristic of ~4 characters per token, which is
 // conservative for English text and code.
@@ -79,9 +83,9 @@ func Fit(model string, sections Sections) Result {
 	baseTokens := EstimateTokens(sections.SystemBase) + EstimateTokens(sections.UserMeta)
 	available := limit - baseTokens
 	if available < 0 {
-		// Base content alone exceeds budget. Truncate UserMeta (keep first 1000 chars).
+		// Base content alone exceeds budget. Truncate UserMeta (keep first ~1000 tokens).
 		if len(sections.UserMeta) > 4000 {
-			sections.UserMeta = sections.UserMeta[:4000] + "\n... [description truncated] ..."
+			sections.UserMeta = truncateUTF8(sections.UserMeta, 4000) + userMetaTruncMarker
 			baseTokens = EstimateTokens(sections.SystemBase) + EstimateTokens(sections.UserMeta)
 			available = limit - baseTokens
 		}
@@ -138,14 +142,20 @@ func Fit(model string, sections Sections) Result {
 		if diffBudget < 0 {
 			diffBudget = 0
 		}
-		maxChars := diffBudget * 4
+		// Reserve space for truncation marker
+		markerBudget := EstimateTokens(diffTruncMarker)
+		effectiveBudget := diffBudget - markerBudget
+		if effectiveBudget < 0 {
+			effectiveBudget = 0
+		}
+		maxChars := effectiveBudget * 4
 		if maxChars < len(sections.Diff) {
 			removed := EstimateTokens(sections.Diff) - diffBudget
 			trimmed = append(trimmed, fmt.Sprintf("diff truncated (~%dK tokens removed)", removed/1000))
 			if maxChars > 0 {
-				sections.Diff = sections.Diff[:maxChars] + "\n\n... [diff truncated due to context limit] ..."
+				sections.Diff = truncateUTF8(sections.Diff, maxChars) + diffTruncMarker
 			} else {
-				sections.Diff = "... [diff too large for context window — review manually] ..."
+				sections.Diff = diffTooLargeMarker
 			}
 		}
 	}
@@ -195,3 +205,22 @@ func buildResult(s Sections, trimmed []string, estTokens int) Result {
 		EstTokens:    estTokens,
 	}
 }
+
+// truncateUTF8 truncates s to at most maxBytes without splitting multi-byte
+// UTF-8 characters. Returns a valid UTF-8 string of at most maxBytes bytes.
+func truncateUTF8(s string, maxBytes int) string {
+	if len(s) <= maxBytes {
+		return s
+	}
+	// Walk backwards from maxBytes to find a valid UTF-8 boundary
+	for maxBytes > 0 && !isUTF8Start(s[maxBytes]) {
+		maxBytes--
+	}
+	return s[:maxBytes]
+}
+
+// isUTF8Start returns true if b is a valid start byte for a UTF-8 sequence
+// (single-byte ASCII or multi-byte lead byte, not a continuation byte).
+func isUTF8Start(b byte) bool {
+	return b&0xC0 != 0x80
+}
diff --git a/review/prompt_test.go b/review/prompt_test.go
index c3c1d2a..b8d705e 100644
--- a/review/prompt_test.go
+++ b/review/prompt_test.go
@@ -116,3 +116,43 @@ func TestBuildUserPrompt_WithoutFileContext(t *testing.T) {
 		t.Error("should not include file context section when empty")
 	}
 }
+
+
+func TestBuildSystemBase(t *testing.T) {
+	result := BuildSystemBase()
+	if result == "" {
+		t.Fatal("BuildSystemBase returned empty string")
+	}
+	if !strings.Contains(result, "expert code reviewer") {
+		t.Error("expected reviewer role in system base")
+	}
+	if !strings.Contains(result, "REQUEST_CHANGES") {
+		t.Error("expected verdict format in system base")
+	}
+	if !strings.Contains(result, "JSON") {
+		t.Error("expected JSON output instruction in system base")
+	}
+}
+
+func TestBuildUserMeta(t *testing.T) {
+	result := BuildUserMeta("Fix bug", "Some description", true, "all checks passed")
+	if !strings.Contains(result, "Fix bug") {
+		t.Error("expected title in user meta")
+	}
+	if !strings.Contains(result, "Some description") {
+		t.Error("expected description in user meta")
+	}
+	if !strings.Contains(result, "PASSED") {
+		t.Error("expected CI PASSED status")
+	}
+}
+
+func TestBuildUserMeta_CIFailed(t *testing.T) {
+	result := BuildUserMeta("Title", "", false, "test job failed")
+	if !strings.Contains(result, "FAILED") {
+		t.Error("expected CI FAILED status")
+	}
+	if strings.Contains(result, "Description") {
+		t.Error("expected no description section when empty")
+	}
+}