From dab7871cb4ed892eb28df4038884c2947e46a915 Mon Sep 17 00:00:00 2001 From: Rodin Date: Fri, 1 May 2026 18:59:07 -0700 Subject: [PATCH] fix: address review findings on budget system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Account for truncation marker tokens when computing diff budget (prevents EstTokens exceeding model limit in edge cases) - Rune-safe truncation for both UserMeta and Diff (no split multi-byte) - Fix misleading comment (1000 chars → ~1000 tokens/4000 chars) - Extract marker strings as constants - Add unit tests for BuildSystemBase and BuildUserMeta --- budget/budget.go | 39 ++++++++++++++++++++++++++++++++++----- review/prompt_test.go | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 5 deletions(-) diff --git a/budget/budget.go b/budget/budget.go index 449687e..cf66222 100644 --- a/budget/budget.go +++ b/budget/budget.go @@ -33,6 +33,10 @@ const defaultLimit = 128_000 // reserveTokens is headroom for the response generation. const reserveTokens = 4_000 +const diffTruncMarker = "\n\n... [diff truncated due to context limit] ..." +const diffTooLargeMarker = "... [diff too large for context window — review manually] ..." +const userMetaTruncMarker = "\n... [description truncated] ..." + // EstimateTokens estimates the number of tokens in a string. // Uses the rough heuristic of ~4 characters per token, which is // conservative for English text and code. @@ -79,9 +83,9 @@ func Fit(model string, sections Sections) Result { baseTokens := EstimateTokens(sections.SystemBase) + EstimateTokens(sections.UserMeta) available := limit - baseTokens if available < 0 { - // Base content alone exceeds budget. Truncate UserMeta (keep first 1000 chars). + // Base content alone exceeds budget. Truncate UserMeta (keep first ~1000 tokens). if len(sections.UserMeta) > 4000 { - sections.UserMeta = sections.UserMeta[:4000] + "\n... [description truncated] ..." + sections.UserMeta = truncateUTF8(sections.UserMeta, 4000) + userMetaTruncMarker baseTokens = EstimateTokens(sections.SystemBase) + EstimateTokens(sections.UserMeta) available = limit - baseTokens } @@ -138,14 +142,20 @@ func Fit(model string, sections Sections) Result { if diffBudget < 0 { diffBudget = 0 } - maxChars := diffBudget * 4 + // Reserve space for truncation marker + markerBudget := EstimateTokens(diffTruncMarker) + effectiveBudget := diffBudget - markerBudget + if effectiveBudget < 0 { + effectiveBudget = 0 + } + maxChars := effectiveBudget * 4 if maxChars < len(sections.Diff) { removed := EstimateTokens(sections.Diff) - diffBudget trimmed = append(trimmed, fmt.Sprintf("diff truncated (~%dK tokens removed)", removed/1000)) if maxChars > 0 { - sections.Diff = sections.Diff[:maxChars] + "\n\n... [diff truncated due to context limit] ..." + sections.Diff = truncateUTF8(sections.Diff, maxChars) + diffTruncMarker } else { - sections.Diff = "... [diff too large for context window — review manually] ..." + sections.Diff = diffTooLargeMarker } } } @@ -195,3 +205,22 @@ func buildResult(s Sections, trimmed []string, estTokens int) Result { EstTokens: estTokens, } } + +// truncateUTF8 truncates s to at most maxBytes without splitting multi-byte +// UTF-8 characters. Returns a valid UTF-8 string of at most maxBytes bytes. +func truncateUTF8(s string, maxBytes int) string { + if len(s) <= maxBytes { + return s + } + // Walk backwards from maxBytes to find a valid UTF-8 boundary + for maxBytes > 0 && !isUTF8Start(s[maxBytes]) { + maxBytes-- + } + return s[:maxBytes] +} + +// isUTF8Start returns true if b is a valid start byte for a UTF-8 sequence +// (single-byte ASCII or multi-byte lead byte, not a continuation byte). +func isUTF8Start(b byte) bool { + return b&0xC0 != 0x80 +} diff --git a/review/prompt_test.go b/review/prompt_test.go index c3c1d2a..b8d705e 100644 --- a/review/prompt_test.go +++ b/review/prompt_test.go @@ -116,3 +116,43 @@ func TestBuildUserPrompt_WithoutFileContext(t *testing.T) { t.Error("should not include file context section when empty") } } + + +func TestBuildSystemBase(t *testing.T) { + result := BuildSystemBase() + if result == "" { + t.Fatal("BuildSystemBase returned empty string") + } + if !strings.Contains(result, "expert code reviewer") { + t.Error("expected reviewer role in system base") + } + if !strings.Contains(result, "REQUEST_CHANGES") { + t.Error("expected verdict format in system base") + } + if !strings.Contains(result, "JSON") { + t.Error("expected JSON output instruction in system base") + } +} + +func TestBuildUserMeta(t *testing.T) { + result := BuildUserMeta("Fix bug", "Some description", true, "all checks passed") + if !strings.Contains(result, "Fix bug") { + t.Error("expected title in user meta") + } + if !strings.Contains(result, "Some description") { + t.Error("expected description in user meta") + } + if !strings.Contains(result, "PASSED") { + t.Error("expected CI PASSED status") + } +} + +func TestBuildUserMeta_CIFailed(t *testing.T) { + result := BuildUserMeta("Title", "", false, "test job failed") + if !strings.Contains(result, "FAILED") { + t.Error("expected CI FAILED status") + } + if strings.Contains(result, "Description") { + t.Error("expected no description section when empty") + } +}