fix: repair unescaped quotes in LLM JSON responses #45
+233
-1
@@ -29,7 +29,12 @@ func ParseResponse(response string) (*ReviewResult, error) {
|
||||
|
||||
var result ReviewResult
|
||||
if err := json.Unmarshal([]byte(cleaned), &result); err != nil {
|
||||
return nil, fmt.Errorf("parse LLM response as JSON: %w\nRaw response: %s", err, response)
|
||||
// LLMs sometimes produce JSON with unescaped quotes inside string values.
|
||||
// Try to repair before giving up.
|
||||
repaired := repairJSON(cleaned)
|
||||
if err2 := json.Unmarshal([]byte(repaired), &result); err2 != nil {
|
||||
return nil, fmt.Errorf("parse LLM response as JSON: %w\nRaw response: %s", err, response)
|
||||
|
|
||||
}
|
||||
}
|
||||
|
||||
// Validate verdict
|
||||
@@ -74,3 +79,230 @@ func extractJSON(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
|
sonnet-review-bot
commented
[NIT] The comment above repairJSON says it will "pick the LAST" valid closing quote to maximize content, but findClosingQuote actually returns the FIRST candidate that yields valid continuation. Align the comment with the implementation or adjust the selection to match the described strategy. **[NIT]** The comment above repairJSON says it will "pick the LAST" valid closing quote to maximize content, but findClosingQuote actually returns the FIRST candidate that yields valid continuation. Align the comment with the implementation or adjust the selection to match the described strategy.
|
||||
return s
|
||||
}
|
||||
|
||||
// repairJSON attempts to fix common LLM JSON issues:
|
||||
// - Unescaped double quotes inside string values
|
||||
//
|
||||
// Strategy: walk the JSON structurally. Object keys are parsed normally (LLMs
|
||||
// get those right). For string VALUES, we find all candidate closing quotes and
|
||||
// pick the LAST one that leaves valid JSON structure afterward — maximizing
|
||||
// string content, which is the correct bias for the "LLM put unescaped quotes
|
||||
// in a string value" failure mode.
|
||||
func repairJSON(s string) string {
|
||||
runes := []rune(s)
|
||||
var out strings.Builder
|
||||
out.Grow(len(s) + 64)
|
||||
|
||||
i := 0
|
||||
for i < len(runes) {
|
||||
c := runes[i]
|
||||
|
||||
if c != '"' {
|
||||
out.WriteRune(c)
|
||||
i++
|
||||
continue
|
||||
}
|
||||
|
||||
// We hit an opening quote. Determine if this is a key or a value.
|
||||
// Keys: the standard JSON parser in LLMs gets keys right, so we parse
|
||||
// them normally (first unescaped quote closes).
|
||||
// Values: may contain unescaped quotes — use the repair heuristic.
|
||||
isValue := isValuePosition(runes, i)
|
||||
|
||||
if !isValue {
|
||||
// Parse key/simple string normally
|
||||
out.WriteRune('"')
|
||||
i++
|
||||
for i < len(runes) {
|
||||
ch := runes[i]
|
||||
if ch == '\\' && i+1 < len(runes) {
|
||||
out.WriteRune(ch)
|
||||
i++
|
||||
out.WriteRune(runes[i])
|
||||
i++
|
||||
continue
|
||||
}
|
||||
if ch == '"' {
|
||||
out.WriteRune('"')
|
||||
i++
|
||||
break
|
||||
}
|
||||
out.WriteRune(ch)
|
||||
i++
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Value string — find the correct close using last-valid-candidate heuristic
|
||||
out.WriteRune('"')
|
||||
i++
|
||||
|
||||
closeIdx := findClosingQuote(runes, i)
|
||||
|
||||
// Write everything between open and close, escaping interior quotes
|
||||
for j := i; j < closeIdx; j++ {
|
||||
ch := runes[j]
|
||||
if ch == '\\' && j+1 < closeIdx {
|
||||
// Already-escaped sequence — pass through
|
||||
out.WriteRune(ch)
|
||||
j++
|
||||
out.WriteRune(runes[j])
|
||||
} else if ch == '"' {
|
||||
out.WriteRune('\\')
|
||||
out.WriteRune('"')
|
||||
} else {
|
||||
out.WriteRune(ch)
|
||||
}
|
||||
}
|
||||
|
||||
// Write the closing quote
|
||||
out.WriteRune('"')
|
||||
i = closeIdx + 1
|
||||
}
|
||||
|
||||
return out.String()
|
||||
}
|
||||
|
||||
// isValuePosition determines if the quote at position i is opening a JSON value
|
||||
// string (as opposed to an object key). We only apply repair to values that
|
||||
// follow ':' since those are the free-text fields where LLMs produce unescaped
|
||||
// quotes. Array elements and keys are left alone (parsed normally).
|
||||
func isValuePosition(runes []rune, i int) bool {
|
||||
// Look backward, skipping whitespace, for the preceding structural char
|
||||
j := i - 1
|
||||
for j >= 0 && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
|
||||
j--
|
||||
}
|
||||
if j < 0 {
|
||||
return false
|
||||
}
|
||||
// After ':' → definitely a value
|
||||
return runes[j] == ':'
|
||||
}
|
||||
|
||||
// findClosingQuote finds the index of the true closing quote for a JSON string
|
||||
// value starting at position start (the character after the opening quote).
|
||||
// It collects all unescaped quote candidates and returns the FIRST one that
|
||||
// produces valid JSON continuation (deeper lookahead verifies the next token).
|
||||
func findClosingQuote(runes []rune, start int) int {
|
||||
|
[MINOR] The repairJSON/findClosingQuote logic scans ahead for every candidate quote and can be O(n^2) on inputs with many quotes, potentially enabling CPU/resource exhaustion if very large or adversarial inputs are processed. Consider bounding input size for repair, short-circuiting after a threshold, or limiting the number of candidates examined. **[MINOR]** The repairJSON/findClosingQuote logic scans ahead for every candidate quote and can be O(n^2) on inputs with many quotes, potentially enabling CPU/resource exhaustion if very large or adversarial inputs are processed. Consider bounding input size for repair, short-circuiting after a threshold, or limiting the number of candidates examined.
|
||||
// Collect all candidate positions for the closing quote.
|
||||
var candidates []int
|
||||
for j := start; j < len(runes); j++ {
|
||||
if runes[j] == '\\' {
|
||||
j++ // skip escaped character
|
||||
continue
|
||||
}
|
||||
if runes[j] == '"' {
|
||||
candidates = append(candidates, j)
|
||||
}
|
||||
}
|
||||
|
||||
if len(candidates) == 0 {
|
||||
return len(runes)
|
||||
}
|
||||
|
||||
if len(candidates) == 1 {
|
||||
return candidates[0]
|
||||
}
|
||||
|
||||
// Try candidates from FIRST to LAST. The correct closing quote is the
|
||||
// earliest one that produces valid JSON structure after it (verified by
|
||||
// deeper lookahead that checks the next token is a valid JSON start).
|
||||
for _, idx := range candidates {
|
||||
if isValidJSONAfterClose(runes, idx+1) {
|
||||
return idx
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: return the last candidate
|
||||
return candidates[len(candidates)-1]
|
||||
}
|
||||
|
||||
// isValidJSONAfterClose checks whether the runes after a candidate closing quote
|
||||
// look like valid JSON continuation for a VALUE string. Since we only use this
|
||||
// for value positions, ':' is NOT a valid continuation (values are never keys).
|
||||
// Checks deeper structure to avoid being fooled by JSON-like content in strings.
|
||||
func isValidJSONAfterClose(runes []rune, pos int) bool {
|
||||
j := pos
|
||||
for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
|
||||
j++
|
||||
}
|
||||
|
||||
if j >= len(runes) {
|
||||
return true
|
||||
}
|
||||
|
||||
next := runes[j]
|
||||
if next == '}' || next == ']' {
|
||||
// Closing a container. Verify what follows the close is also valid:
|
||||
// another structural char, comma, or EOF.
|
||||
return isValidAfterContainerClose(runes, j+1)
|
||||
}
|
||||
if next == ',' {
|
||||
// After comma, must be followed by a valid JSON token
|
||||
j++
|
||||
for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
|
||||
j++
|
||||
}
|
||||
if j >= len(runes) {
|
||||
return false // trailing comma with nothing after — invalid
|
||||
}
|
||||
return isJSONTokenStart(runes, j)
|
||||
}
|
||||
// ':' is NOT valid here — we're in a value position, not a key.
|
||||
// Any other character is also invalid.
|
||||
return false
|
||||
}
|
||||
|
||||
// isValidAfterContainerClose checks that after a } or ], the continuation is
|
||||
// structurally valid: more closes, comma+token, or EOF.
|
||||
func isValidAfterContainerClose(runes []rune, pos int) bool {
|
||||
j := pos
|
||||
for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
|
||||
j++
|
||||
}
|
||||
if j >= len(runes) {
|
||||
return true
|
||||
}
|
||||
next := runes[j]
|
||||
if next == '}' || next == ']' {
|
||||
return isValidAfterContainerClose(runes, j+1)
|
||||
}
|
||||
if next == ',' {
|
||||
j++
|
||||
for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
|
||||
j++
|
||||
}
|
||||
if j >= len(runes) {
|
||||
return false
|
||||
}
|
||||
return isJSONTokenStart(runes, j)
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// isJSONTokenStart returns true if the rune could begin a JSON value or key.
|
||||
// For keywords (true/false/null), verifies the full keyword is present.
|
||||
func isJSONTokenStart(runes []rune, pos int) bool {
|
||||
if pos >= len(runes) {
|
||||
return false
|
||||
}
|
||||
r := runes[pos]
|
||||
switch {
|
||||
case r == '"': // string
|
||||
return true
|
||||
case r == '{' || r == '[': // object or array
|
||||
return true
|
||||
case r == 't': // true
|
||||
return pos+4 <= len(runes) && string(runes[pos:pos+4]) == "true"
|
||||
case r == 'f': // false
|
||||
return pos+5 <= len(runes) && string(runes[pos:pos+5]) == "false"
|
||||
case r == 'n': // null
|
||||
return pos+4 <= len(runes) && string(runes[pos:pos+4]) == "null"
|
||||
case r >= '0' && r <= '9': // number
|
||||
return true
|
||||
case r == '-': // negative number
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package review
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -112,3 +113,112 @@ func TestParseResponse_MarkdownFencesNoLang(t *testing.T) {
|
||||
t.Errorf("expected APPROVE, got %q", result.Verdict)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseResponse_UnescapedQuotesInStrings(t *testing.T) {
|
||||
// Real failure from CI: Sonnet puts unescaped quotes like (e.g. "28") in findings
|
||||
input := `{"verdict": "APPROVE", "summary": "Clean PR", "findings": [{"severity": "NIT", "file": "ci/Dockerfile", "line": 14, "finding": "The comment says OTP_VERSION is the major version (e.g. \"28\") but it actually contains unescaped quotes like (e.g. "28") which breaks JSON"}], "recommendation": "Ship it"}`
|
||||
|
||||
result, err := ParseResponse(input)
|
||||
if err != nil {
|
||||
t.Fatalf("expected repair to handle unescaped quotes, got error: %v", err)
|
||||
}
|
||||
if result.Verdict != "APPROVE" {
|
||||
t.Errorf("expected APPROVE, got %q", result.Verdict)
|
||||
}
|
||||
if len(result.Findings) != 1 {
|
||||
t.Fatalf("expected 1 finding, got %d", len(result.Findings))
|
||||
}
|
||||
}
|
||||
|
||||
func TestRepairJSON_NoOpOnValid(t *testing.T) {
|
||||
valid := `{"key": "value", "num": 42}`
|
||||
result := repairJSON(valid)
|
||||
if result != valid {
|
||||
t.Errorf("repairJSON should not modify valid JSON\n got: %s\n want: %s", result, valid)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRepairJSON_FixesUnescapedQuotes(t *testing.T) {
|
||||
// Interior quote followed by non-structural character
|
||||
input := `{"msg": "use "foo" here"}`
|
||||
result := repairJSON(input)
|
||||
|
||||
// Should be parseable now
|
||||
var m map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(result), &m); err != nil {
|
||||
t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRepairJSON_InteriorQuoteBeforeComma(t *testing.T) {
|
||||
// Bug reported by reviewer: interior quoted word immediately before a comma
|
||||
input := `{"msg": "say "yes", and go"}`
|
||||
result := repairJSON(input)
|
||||
|
||||
var m map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(result), &m); err != nil {
|
||||
t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result)
|
||||
}
|
||||
// The full string content should be preserved
|
||||
msg, ok := m["msg"].(string)
|
||||
if !ok {
|
||||
t.Fatal("msg field missing or not a string")
|
||||
}
|
||||
if msg != `say "yes", and go` {
|
||||
t.Errorf("unexpected msg content: %q", msg)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRepairJSON_InteriorQuoteBeforeCloseBrace(t *testing.T) {
|
||||
// Bug reported by reviewer: JSON-shaped syntax inside string values
|
||||
input := `{"msg": "input map {"key": "val"} caused error"}`
|
||||
result := repairJSON(input)
|
||||
|
||||
var m map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(result), &m); err != nil {
|
||||
t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRepairJSON_MultipleFields(t *testing.T) {
|
||||
// Multiple string fields with unescaped quotes in different positions
|
||||
input := `{"a": "hello "world"", "b": "foo"}`
|
||||
result := repairJSON(input)
|
||||
|
||||
var m map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(result), &m); err != nil {
|
||||
t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result)
|
||||
}
|
||||
if _, ok := m["b"]; !ok {
|
||||
t.Error("expected 'b' field to be preserved")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRepairJSON_PreservesEscapedQuotes(t *testing.T) {
|
||||
// Already-escaped quotes should not be double-escaped
|
||||
input := `{"msg": "already \"escaped\" here"}`
|
||||
result := repairJSON(input)
|
||||
|
||||
if result != input {
|
||||
t.Errorf("repairJSON should not modify already-escaped quotes\n got: %s\n want: %s", result, input)
|
||||
}
|
||||
|
||||
var m map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(result), &m); err != nil {
|
||||
t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRepairJSON_ComplexNestedContent(t *testing.T) {
|
||||
// Combines both reviewer bugs: quoted words before commas AND JSON-like content
|
||||
input := `{"verdict": "APPROVE", "findings": [{"finding": "The map {"key": "val"} and (e.g. "28") and say "yes", then stop"}]}`
|
||||
result := repairJSON(input)
|
||||
|
||||
var parsed map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(result), &parsed); err != nil {
|
||||
t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result)
|
||||
}
|
||||
if parsed["verdict"] != "APPROVE" {
|
||||
t.Errorf("expected verdict APPROVE, got %v", parsed["verdict"])
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user
[MINOR] The returned error includes the full raw LLM response. This can lead to information leakage and log amplification if upstream logs the error verbatim. Consider redacting or truncating the response (e.g., first N bytes) before including it in error messages.