From cabbb5a55ae073aadc1e65b7adc647eecc6bdd78 Mon Sep 17 00:00:00 2001 From: Rodin <4+rodin@noreply.gitea.weiker.me> Date: Tue, 5 May 2026 12:40:39 +0000 Subject: [PATCH] fix: repair unescaped quotes in LLM JSON responses (#45) fix: repair unescaped quotes in LLM JSON responses Add repairJSON fallback that handles unescaped quotes in LLM string values using first-valid-candidate heuristic with structural lookahead. Reviewed-by: sonnet-review-bot Reviewed-by: gpt-review-bot Reviewed-by: security-review-bot --- review/parser.go | 234 +++++++++++++++++++++++++++++++++++++++++- review/parser_test.go | 110 ++++++++++++++++++++ 2 files changed, 343 insertions(+), 1 deletion(-) diff --git a/review/parser.go b/review/parser.go index dc8eb73..57c0b24 100644 --- a/review/parser.go +++ b/review/parser.go @@ -29,7 +29,12 @@ func ParseResponse(response string) (*ReviewResult, error) { var result ReviewResult if err := json.Unmarshal([]byte(cleaned), &result); err != nil { - return nil, fmt.Errorf("parse LLM response as JSON: %w\nRaw response: %s", err, response) + // LLMs sometimes produce JSON with unescaped quotes inside string values. + // Try to repair before giving up. + repaired := repairJSON(cleaned) + if err2 := json.Unmarshal([]byte(repaired), &result); err2 != nil { + return nil, fmt.Errorf("parse LLM response as JSON: %w\nRaw response: %s", err, response) + } } // Validate verdict @@ -74,3 +79,230 @@ func extractJSON(s string) string { s = strings.TrimSpace(s) return s } + +// repairJSON attempts to fix common LLM JSON issues: +// - Unescaped double quotes inside string values +// +// Strategy: walk the JSON structurally. Object keys are parsed normally (LLMs +// get those right). For string VALUES, we find all candidate closing quotes and +// pick the LAST one that leaves valid JSON structure afterward — maximizing +// string content, which is the correct bias for the "LLM put unescaped quotes +// in a string value" failure mode. +func repairJSON(s string) string { + runes := []rune(s) + var out strings.Builder + out.Grow(len(s) + 64) + + i := 0 + for i < len(runes) { + c := runes[i] + + if c != '"' { + out.WriteRune(c) + i++ + continue + } + + // We hit an opening quote. Determine if this is a key or a value. + // Keys: the standard JSON parser in LLMs gets keys right, so we parse + // them normally (first unescaped quote closes). + // Values: may contain unescaped quotes — use the repair heuristic. + isValue := isValuePosition(runes, i) + + if !isValue { + // Parse key/simple string normally + out.WriteRune('"') + i++ + for i < len(runes) { + ch := runes[i] + if ch == '\\' && i+1 < len(runes) { + out.WriteRune(ch) + i++ + out.WriteRune(runes[i]) + i++ + continue + } + if ch == '"' { + out.WriteRune('"') + i++ + break + } + out.WriteRune(ch) + i++ + } + continue + } + + // Value string — find the correct close using last-valid-candidate heuristic + out.WriteRune('"') + i++ + + closeIdx := findClosingQuote(runes, i) + + // Write everything between open and close, escaping interior quotes + for j := i; j < closeIdx; j++ { + ch := runes[j] + if ch == '\\' && j+1 < closeIdx { + // Already-escaped sequence — pass through + out.WriteRune(ch) + j++ + out.WriteRune(runes[j]) + } else if ch == '"' { + out.WriteRune('\\') + out.WriteRune('"') + } else { + out.WriteRune(ch) + } + } + + // Write the closing quote + out.WriteRune('"') + i = closeIdx + 1 + } + + return out.String() +} + +// isValuePosition determines if the quote at position i is opening a JSON value +// string (as opposed to an object key). We only apply repair to values that +// follow ':' since those are the free-text fields where LLMs produce unescaped +// quotes. Array elements and keys are left alone (parsed normally). +func isValuePosition(runes []rune, i int) bool { + // Look backward, skipping whitespace, for the preceding structural char + j := i - 1 + for j >= 0 && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j-- + } + if j < 0 { + return false + } + // After ':' → definitely a value + return runes[j] == ':' +} + +// findClosingQuote finds the index of the true closing quote for a JSON string +// value starting at position start (the character after the opening quote). +// It collects all unescaped quote candidates and returns the FIRST one that +// produces valid JSON continuation (deeper lookahead verifies the next token). +func findClosingQuote(runes []rune, start int) int { + // Collect all candidate positions for the closing quote. + var candidates []int + for j := start; j < len(runes); j++ { + if runes[j] == '\\' { + j++ // skip escaped character + continue + } + if runes[j] == '"' { + candidates = append(candidates, j) + } + } + + if len(candidates) == 0 { + return len(runes) + } + + if len(candidates) == 1 { + return candidates[0] + } + + // Try candidates from FIRST to LAST. The correct closing quote is the + // earliest one that produces valid JSON structure after it (verified by + // deeper lookahead that checks the next token is a valid JSON start). + for _, idx := range candidates { + if isValidJSONAfterClose(runes, idx+1) { + return idx + } + } + + // Fallback: return the last candidate + return candidates[len(candidates)-1] +} + +// isValidJSONAfterClose checks whether the runes after a candidate closing quote +// look like valid JSON continuation for a VALUE string. Since we only use this +// for value positions, ':' is NOT a valid continuation (values are never keys). +// Checks deeper structure to avoid being fooled by JSON-like content in strings. +func isValidJSONAfterClose(runes []rune, pos int) bool { + j := pos + for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j++ + } + + if j >= len(runes) { + return true + } + + next := runes[j] + if next == '}' || next == ']' { + // Closing a container. Verify what follows the close is also valid: + // another structural char, comma, or EOF. + return isValidAfterContainerClose(runes, j+1) + } + if next == ',' { + // After comma, must be followed by a valid JSON token + j++ + for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j++ + } + if j >= len(runes) { + return false // trailing comma with nothing after — invalid + } + return isJSONTokenStart(runes, j) + } + // ':' is NOT valid here — we're in a value position, not a key. + // Any other character is also invalid. + return false +} + +// isValidAfterContainerClose checks that after a } or ], the continuation is +// structurally valid: more closes, comma+token, or EOF. +func isValidAfterContainerClose(runes []rune, pos int) bool { + j := pos + for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j++ + } + if j >= len(runes) { + return true + } + next := runes[j] + if next == '}' || next == ']' { + return isValidAfterContainerClose(runes, j+1) + } + if next == ',' { + j++ + for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j++ + } + if j >= len(runes) { + return false + } + return isJSONTokenStart(runes, j) + } + return false +} + +// isJSONTokenStart returns true if the rune could begin a JSON value or key. +// For keywords (true/false/null), verifies the full keyword is present. +func isJSONTokenStart(runes []rune, pos int) bool { + if pos >= len(runes) { + return false + } + r := runes[pos] + switch { + case r == '"': // string + return true + case r == '{' || r == '[': // object or array + return true + case r == 't': // true + return pos+4 <= len(runes) && string(runes[pos:pos+4]) == "true" + case r == 'f': // false + return pos+5 <= len(runes) && string(runes[pos:pos+5]) == "false" + case r == 'n': // null + return pos+4 <= len(runes) && string(runes[pos:pos+4]) == "null" + case r >= '0' && r <= '9': // number + return true + case r == '-': // negative number + return true + } + return false +} diff --git a/review/parser_test.go b/review/parser_test.go index 138a866..56c6508 100644 --- a/review/parser_test.go +++ b/review/parser_test.go @@ -1,6 +1,7 @@ package review import ( + "encoding/json" "testing" ) @@ -112,3 +113,112 @@ func TestParseResponse_MarkdownFencesNoLang(t *testing.T) { t.Errorf("expected APPROVE, got %q", result.Verdict) } } + +func TestParseResponse_UnescapedQuotesInStrings(t *testing.T) { + // Real failure from CI: Sonnet puts unescaped quotes like (e.g. "28") in findings + input := `{"verdict": "APPROVE", "summary": "Clean PR", "findings": [{"severity": "NIT", "file": "ci/Dockerfile", "line": 14, "finding": "The comment says OTP_VERSION is the major version (e.g. \"28\") but it actually contains unescaped quotes like (e.g. "28") which breaks JSON"}], "recommendation": "Ship it"}` + + result, err := ParseResponse(input) + if err != nil { + t.Fatalf("expected repair to handle unescaped quotes, got error: %v", err) + } + if result.Verdict != "APPROVE" { + t.Errorf("expected APPROVE, got %q", result.Verdict) + } + if len(result.Findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(result.Findings)) + } +} + +func TestRepairJSON_NoOpOnValid(t *testing.T) { + valid := `{"key": "value", "num": 42}` + result := repairJSON(valid) + if result != valid { + t.Errorf("repairJSON should not modify valid JSON\n got: %s\n want: %s", result, valid) + } +} + +func TestRepairJSON_FixesUnescapedQuotes(t *testing.T) { + // Interior quote followed by non-structural character + input := `{"msg": "use "foo" here"}` + result := repairJSON(input) + + // Should be parseable now + var m map[string]interface{} + if err := json.Unmarshal([]byte(result), &m); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } +} + +func TestRepairJSON_InteriorQuoteBeforeComma(t *testing.T) { + // Bug reported by reviewer: interior quoted word immediately before a comma + input := `{"msg": "say "yes", and go"}` + result := repairJSON(input) + + var m map[string]interface{} + if err := json.Unmarshal([]byte(result), &m); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } + // The full string content should be preserved + msg, ok := m["msg"].(string) + if !ok { + t.Fatal("msg field missing or not a string") + } + if msg != `say "yes", and go` { + t.Errorf("unexpected msg content: %q", msg) + } +} + +func TestRepairJSON_InteriorQuoteBeforeCloseBrace(t *testing.T) { + // Bug reported by reviewer: JSON-shaped syntax inside string values + input := `{"msg": "input map {"key": "val"} caused error"}` + result := repairJSON(input) + + var m map[string]interface{} + if err := json.Unmarshal([]byte(result), &m); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } +} + +func TestRepairJSON_MultipleFields(t *testing.T) { + // Multiple string fields with unescaped quotes in different positions + input := `{"a": "hello "world"", "b": "foo"}` + result := repairJSON(input) + + var m map[string]interface{} + if err := json.Unmarshal([]byte(result), &m); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } + if _, ok := m["b"]; !ok { + t.Error("expected 'b' field to be preserved") + } +} + +func TestRepairJSON_PreservesEscapedQuotes(t *testing.T) { + // Already-escaped quotes should not be double-escaped + input := `{"msg": "already \"escaped\" here"}` + result := repairJSON(input) + + if result != input { + t.Errorf("repairJSON should not modify already-escaped quotes\n got: %s\n want: %s", result, input) + } + + var m map[string]interface{} + if err := json.Unmarshal([]byte(result), &m); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } +} + +func TestRepairJSON_ComplexNestedContent(t *testing.T) { + // Combines both reviewer bugs: quoted words before commas AND JSON-like content + input := `{"verdict": "APPROVE", "findings": [{"finding": "The map {"key": "val"} and (e.g. "28") and say "yes", then stop"}]}` + result := repairJSON(input) + + var parsed map[string]interface{} + if err := json.Unmarshal([]byte(result), &parsed); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } + if parsed["verdict"] != "APPROVE" { + t.Errorf("expected verdict APPROVE, got %v", parsed["verdict"]) + } +}