From 80a9a7675bc0b51d7e5c619ba9b7ccd88cc24f9a Mon Sep 17 00:00:00 2001 From: Rodin Date: Sun, 3 May 2026 09:47:22 -0700 Subject: [PATCH 1/3] fix: repair unescaped quotes in LLM JSON responses LLMs (especially Sonnet) sometimes emit JSON with unescaped double quotes inside string values, e.g. (e.g. "28") instead of properly escaping them. This caused parse failures in CI. Add a repairJSON fallback that uses a character-by-character scanner to identify interior quotes (those not followed by structural JSON characters) and escape them before retrying the parse. Fixes sonnet-review failures on gargoyle PR #551. --- review/parser.go | 76 ++++++++++++++++++++++++++++++++++++++++++- review/parser_test.go | 37 +++++++++++++++++++++ 2 files changed, 112 insertions(+), 1 deletion(-) diff --git a/review/parser.go b/review/parser.go index dc8eb73..a615e86 100644 --- a/review/parser.go +++ b/review/parser.go @@ -29,7 +29,12 @@ func ParseResponse(response string) (*ReviewResult, error) { var result ReviewResult if err := json.Unmarshal([]byte(cleaned), &result); err != nil { - return nil, fmt.Errorf("parse LLM response as JSON: %w\nRaw response: %s", err, response) + // LLMs sometimes produce JSON with unescaped quotes inside string values. + // Try to repair before giving up. + repaired := repairJSON(cleaned) + if err2 := json.Unmarshal([]byte(repaired), &result); err2 != nil { + return nil, fmt.Errorf("parse LLM response as JSON: %w\nRaw response: %s", err, response) + } } // Validate verdict @@ -74,3 +79,72 @@ func extractJSON(s string) string { s = strings.TrimSpace(s) return s } + +// repairJSON attempts to fix common LLM JSON issues: +// - Unescaped double quotes inside string values +// +// Strategy: walk the JSON character by character, tracking whether we're inside +// a string value. When we find a quote that doesn't look like a structural +// delimiter (not preceded by \, not followed by : , } ] or whitespace+structural), +// escape it. +func repairJSON(s string) string { + var out strings.Builder + out.Grow(len(s) + 64) + + inString := false + runes := []rune(s) + + for i := 0; i < len(runes); i++ { + c := runes[i] + + if !inString { + out.WriteRune(c) + if c == '"' { + inString = true + } + continue + } + + // We're inside a string + if c == '\\' { + // Escape sequence — pass through both characters + out.WriteRune(c) + if i+1 < len(runes) { + i++ + out.WriteRune(runes[i]) + } + continue + } + + if c == '"' { + // Is this the end of the string, or an unescaped interior quote? + // Look ahead: skip whitespace, then check for structural character + j := i + 1 + for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j++ + } + if j < len(runes) { + next := runes[j] + if next == ',' || next == '}' || next == ']' || next == ':' { + // Structural — this is really the end of the string + out.WriteRune(c) + inString = false + continue + } + } else { + // End of input — must be closing quote + out.WriteRune(c) + inString = false + continue + } + // Not a structural close — this is an unescaped interior quote + out.WriteRune('\\') + out.WriteRune('"') + continue + } + + out.WriteRune(c) + } + + return out.String() +} diff --git a/review/parser_test.go b/review/parser_test.go index 138a866..ae8c071 100644 --- a/review/parser_test.go +++ b/review/parser_test.go @@ -1,6 +1,7 @@ package review import ( + "encoding/json" "testing" ) @@ -112,3 +113,39 @@ func TestParseResponse_MarkdownFencesNoLang(t *testing.T) { t.Errorf("expected APPROVE, got %q", result.Verdict) } } + +func TestParseResponse_UnescapedQuotesInStrings(t *testing.T) { + // Real failure from CI: Sonnet puts unescaped quotes like (e.g. "28") in findings + input := `{"verdict": "APPROVE", "summary": "Clean PR", "findings": [{"severity": "NIT", "file": "ci/Dockerfile", "line": 14, "finding": "The comment says OTP_VERSION is the major version (e.g. \"28\") but it actually contains unescaped quotes like (e.g. "28") which breaks JSON"}], "recommendation": "Ship it"}` + + result, err := ParseResponse(input) + if err != nil { + t.Fatalf("expected repair to handle unescaped quotes, got error: %v", err) + } + if result.Verdict != "APPROVE" { + t.Errorf("expected APPROVE, got %q", result.Verdict) + } + if len(result.Findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(result.Findings)) + } +} + +func TestRepairJSON_NoOpOnValid(t *testing.T) { + valid := `{"key": "value", "num": 42}` + result := repairJSON(valid) + if result != valid { + t.Errorf("repairJSON should not modify valid JSON\n got: %s\n want: %s", result, valid) + } +} + +func TestRepairJSON_FixesUnescapedQuotes(t *testing.T) { + // Interior quote followed by non-structural character + input := `{"msg": "use "foo" here"}` + result := repairJSON(input) + + // Should be parseable now + var m map[string]interface{} + if err := json.Unmarshal([]byte(result), &m); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } +} -- 2.47.3 From 25d1a670bfa379d4ecff854c6387ff491539dad3 Mon Sep 17 00:00:00 2001 From: Rodin Date: Mon, 4 May 2026 21:27:39 -0700 Subject: [PATCH 2/3] fix: redesign repairJSON to handle all reviewer-reported edge cases Rewrites the JSON repair algorithm to address two correctness bugs identified in code review: 1. Interior quoted word before comma: "say "yes", and go" was misidentified as structural close because "," followed the quote. 2. JSON-shaped content in strings: {"key": "val"} inside a string value was being parsed as actual JSON structure. The new approach: - Distinguishes keys from values (only values need repair) - Uses first-valid-candidate scan with deep lookahead - Verifies that after a candidate close, the continuation is not just a structural char but a complete valid JSON pattern - Validates keyword tokens (true/false/null) fully, not just first char - Checks container closes recursively for valid continuation Adds comprehensive tests for all reported edge cases plus a complex combined scenario with nested JSON-like content, quoted words before commas, and multiple failure modes in one string. --- review/parser.go | 246 ++++++++++++++++++++++++++++++++++-------- review/parser_test.go | 73 +++++++++++++ 2 files changed, 275 insertions(+), 44 deletions(-) diff --git a/review/parser.go b/review/parser.go index a615e86..57c0b24 100644 --- a/review/parser.go +++ b/review/parser.go @@ -83,68 +83,226 @@ func extractJSON(s string) string { // repairJSON attempts to fix common LLM JSON issues: // - Unescaped double quotes inside string values // -// Strategy: walk the JSON character by character, tracking whether we're inside -// a string value. When we find a quote that doesn't look like a structural -// delimiter (not preceded by \, not followed by : , } ] or whitespace+structural), -// escape it. +// Strategy: walk the JSON structurally. Object keys are parsed normally (LLMs +// get those right). For string VALUES, we find all candidate closing quotes and +// pick the LAST one that leaves valid JSON structure afterward — maximizing +// string content, which is the correct bias for the "LLM put unescaped quotes +// in a string value" failure mode. func repairJSON(s string) string { + runes := []rune(s) var out strings.Builder out.Grow(len(s) + 64) - inString := false - runes := []rune(s) - - for i := 0; i < len(runes); i++ { + i := 0 + for i < len(runes) { c := runes[i] - if !inString { + if c != '"' { out.WriteRune(c) - if c == '"' { - inString = true - } + i++ continue } - // We're inside a string - if c == '\\' { - // Escape sequence — pass through both characters - out.WriteRune(c) - if i+1 < len(runes) { - i++ - out.WriteRune(runes[i]) - } - continue - } + // We hit an opening quote. Determine if this is a key or a value. + // Keys: the standard JSON parser in LLMs gets keys right, so we parse + // them normally (first unescaped quote closes). + // Values: may contain unescaped quotes — use the repair heuristic. + isValue := isValuePosition(runes, i) - if c == '"' { - // Is this the end of the string, or an unescaped interior quote? - // Look ahead: skip whitespace, then check for structural character - j := i + 1 - for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { - j++ - } - if j < len(runes) { - next := runes[j] - if next == ',' || next == '}' || next == ']' || next == ':' { - // Structural — this is really the end of the string - out.WriteRune(c) - inString = false + if !isValue { + // Parse key/simple string normally + out.WriteRune('"') + i++ + for i < len(runes) { + ch := runes[i] + if ch == '\\' && i+1 < len(runes) { + out.WriteRune(ch) + i++ + out.WriteRune(runes[i]) + i++ continue } - } else { - // End of input — must be closing quote - out.WriteRune(c) - inString = false - continue + if ch == '"' { + out.WriteRune('"') + i++ + break + } + out.WriteRune(ch) + i++ } - // Not a structural close — this is an unescaped interior quote - out.WriteRune('\\') - out.WriteRune('"') continue } - out.WriteRune(c) + // Value string — find the correct close using last-valid-candidate heuristic + out.WriteRune('"') + i++ + + closeIdx := findClosingQuote(runes, i) + + // Write everything between open and close, escaping interior quotes + for j := i; j < closeIdx; j++ { + ch := runes[j] + if ch == '\\' && j+1 < closeIdx { + // Already-escaped sequence — pass through + out.WriteRune(ch) + j++ + out.WriteRune(runes[j]) + } else if ch == '"' { + out.WriteRune('\\') + out.WriteRune('"') + } else { + out.WriteRune(ch) + } + } + + // Write the closing quote + out.WriteRune('"') + i = closeIdx + 1 } return out.String() } + +// isValuePosition determines if the quote at position i is opening a JSON value +// string (as opposed to an object key). We only apply repair to values that +// follow ':' since those are the free-text fields where LLMs produce unescaped +// quotes. Array elements and keys are left alone (parsed normally). +func isValuePosition(runes []rune, i int) bool { + // Look backward, skipping whitespace, for the preceding structural char + j := i - 1 + for j >= 0 && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j-- + } + if j < 0 { + return false + } + // After ':' → definitely a value + return runes[j] == ':' +} + +// findClosingQuote finds the index of the true closing quote for a JSON string +// value starting at position start (the character after the opening quote). +// It collects all unescaped quote candidates and returns the FIRST one that +// produces valid JSON continuation (deeper lookahead verifies the next token). +func findClosingQuote(runes []rune, start int) int { + // Collect all candidate positions for the closing quote. + var candidates []int + for j := start; j < len(runes); j++ { + if runes[j] == '\\' { + j++ // skip escaped character + continue + } + if runes[j] == '"' { + candidates = append(candidates, j) + } + } + + if len(candidates) == 0 { + return len(runes) + } + + if len(candidates) == 1 { + return candidates[0] + } + + // Try candidates from FIRST to LAST. The correct closing quote is the + // earliest one that produces valid JSON structure after it (verified by + // deeper lookahead that checks the next token is a valid JSON start). + for _, idx := range candidates { + if isValidJSONAfterClose(runes, idx+1) { + return idx + } + } + + // Fallback: return the last candidate + return candidates[len(candidates)-1] +} + +// isValidJSONAfterClose checks whether the runes after a candidate closing quote +// look like valid JSON continuation for a VALUE string. Since we only use this +// for value positions, ':' is NOT a valid continuation (values are never keys). +// Checks deeper structure to avoid being fooled by JSON-like content in strings. +func isValidJSONAfterClose(runes []rune, pos int) bool { + j := pos + for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j++ + } + + if j >= len(runes) { + return true + } + + next := runes[j] + if next == '}' || next == ']' { + // Closing a container. Verify what follows the close is also valid: + // another structural char, comma, or EOF. + return isValidAfterContainerClose(runes, j+1) + } + if next == ',' { + // After comma, must be followed by a valid JSON token + j++ + for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j++ + } + if j >= len(runes) { + return false // trailing comma with nothing after — invalid + } + return isJSONTokenStart(runes, j) + } + // ':' is NOT valid here — we're in a value position, not a key. + // Any other character is also invalid. + return false +} + +// isValidAfterContainerClose checks that after a } or ], the continuation is +// structurally valid: more closes, comma+token, or EOF. +func isValidAfterContainerClose(runes []rune, pos int) bool { + j := pos + for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j++ + } + if j >= len(runes) { + return true + } + next := runes[j] + if next == '}' || next == ']' { + return isValidAfterContainerClose(runes, j+1) + } + if next == ',' { + j++ + for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j++ + } + if j >= len(runes) { + return false + } + return isJSONTokenStart(runes, j) + } + return false +} + +// isJSONTokenStart returns true if the rune could begin a JSON value or key. +// For keywords (true/false/null), verifies the full keyword is present. +func isJSONTokenStart(runes []rune, pos int) bool { + if pos >= len(runes) { + return false + } + r := runes[pos] + switch { + case r == '"': // string + return true + case r == '{' || r == '[': // object or array + return true + case r == 't': // true + return pos+4 <= len(runes) && string(runes[pos:pos+4]) == "true" + case r == 'f': // false + return pos+5 <= len(runes) && string(runes[pos:pos+5]) == "false" + case r == 'n': // null + return pos+4 <= len(runes) && string(runes[pos:pos+4]) == "null" + case r >= '0' && r <= '9': // number + return true + case r == '-': // negative number + return true + } + return false +} diff --git a/review/parser_test.go b/review/parser_test.go index ae8c071..56c6508 100644 --- a/review/parser_test.go +++ b/review/parser_test.go @@ -149,3 +149,76 @@ func TestRepairJSON_FixesUnescapedQuotes(t *testing.T) { t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) } } + +func TestRepairJSON_InteriorQuoteBeforeComma(t *testing.T) { + // Bug reported by reviewer: interior quoted word immediately before a comma + input := `{"msg": "say "yes", and go"}` + result := repairJSON(input) + + var m map[string]interface{} + if err := json.Unmarshal([]byte(result), &m); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } + // The full string content should be preserved + msg, ok := m["msg"].(string) + if !ok { + t.Fatal("msg field missing or not a string") + } + if msg != `say "yes", and go` { + t.Errorf("unexpected msg content: %q", msg) + } +} + +func TestRepairJSON_InteriorQuoteBeforeCloseBrace(t *testing.T) { + // Bug reported by reviewer: JSON-shaped syntax inside string values + input := `{"msg": "input map {"key": "val"} caused error"}` + result := repairJSON(input) + + var m map[string]interface{} + if err := json.Unmarshal([]byte(result), &m); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } +} + +func TestRepairJSON_MultipleFields(t *testing.T) { + // Multiple string fields with unescaped quotes in different positions + input := `{"a": "hello "world"", "b": "foo"}` + result := repairJSON(input) + + var m map[string]interface{} + if err := json.Unmarshal([]byte(result), &m); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } + if _, ok := m["b"]; !ok { + t.Error("expected 'b' field to be preserved") + } +} + +func TestRepairJSON_PreservesEscapedQuotes(t *testing.T) { + // Already-escaped quotes should not be double-escaped + input := `{"msg": "already \"escaped\" here"}` + result := repairJSON(input) + + if result != input { + t.Errorf("repairJSON should not modify already-escaped quotes\n got: %s\n want: %s", result, input) + } + + var m map[string]interface{} + if err := json.Unmarshal([]byte(result), &m); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } +} + +func TestRepairJSON_ComplexNestedContent(t *testing.T) { + // Combines both reviewer bugs: quoted words before commas AND JSON-like content + input := `{"verdict": "APPROVE", "findings": [{"finding": "The map {"key": "val"} and (e.g. "28") and say "yes", then stop"}]}` + result := repairJSON(input) + + var parsed map[string]interface{} + if err := json.Unmarshal([]byte(result), &parsed); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } + if parsed["verdict"] != "APPROVE" { + t.Errorf("expected verdict APPROVE, got %v", parsed["verdict"]) + } +} -- 2.47.3 From 489457c1847525d9e0d38daf13d670129f5ae3b4 Mon Sep 17 00:00:00 2001 From: Rodin Date: Mon, 4 May 2026 23:15:08 -0700 Subject: [PATCH 3/3] ci: retrigger after LLM_BASE_URL secret fix -- 2.47.3