fix: redesign repairJSON to handle all reviewer-reported edge cases
CI / test (pull_request) Successful in 11s
CI / review (gpt-4.1, gpt, GPT_REVIEW_TOKEN) (pull_request) Failing after 10s
CI / review (gpt-5, security, SECURITY_REVIEW.md, SECURITY_REVIEW_TOKEN) (pull_request) Failing after 10s
CI / review (gpt-5, sonnet, SONNET_REVIEW_TOKEN) (pull_request) Failing after 11s
CI / test (pull_request) Successful in 11s
CI / review (gpt-4.1, gpt, GPT_REVIEW_TOKEN) (pull_request) Failing after 10s
CI / review (gpt-5, security, SECURITY_REVIEW.md, SECURITY_REVIEW_TOKEN) (pull_request) Failing after 10s
CI / review (gpt-5, sonnet, SONNET_REVIEW_TOKEN) (pull_request) Failing after 11s
Rewrites the JSON repair algorithm to address two correctness bugs
identified in code review:
1. Interior quoted word before comma: "say "yes", and go" was
misidentified as structural close because "," followed the quote.
2. JSON-shaped content in strings: {"key": "val"} inside a string
value was being parsed as actual JSON structure.
The new approach:
- Distinguishes keys from values (only values need repair)
- Uses first-valid-candidate scan with deep lookahead
- Verifies that after a candidate close, the continuation is not just
a structural char but a complete valid JSON pattern
- Validates keyword tokens (true/false/null) fully, not just first char
- Checks container closes recursively for valid continuation
Adds comprehensive tests for all reported edge cases plus a complex
combined scenario with nested JSON-like content, quoted words before
commas, and multiple failure modes in one string.
This commit is contained in:
+198
-40
@@ -83,68 +83,226 @@ func extractJSON(s string) string {
|
|||||||
// repairJSON attempts to fix common LLM JSON issues:
|
// repairJSON attempts to fix common LLM JSON issues:
|
||||||
// - Unescaped double quotes inside string values
|
// - Unescaped double quotes inside string values
|
||||||
//
|
//
|
||||||
// Strategy: walk the JSON character by character, tracking whether we're inside
|
// Strategy: walk the JSON structurally. Object keys are parsed normally (LLMs
|
||||||
// a string value. When we find a quote that doesn't look like a structural
|
// get those right). For string VALUES, we find all candidate closing quotes and
|
||||||
// delimiter (not preceded by \, not followed by : , } ] or whitespace+structural),
|
// pick the LAST one that leaves valid JSON structure afterward — maximizing
|
||||||
// escape it.
|
// string content, which is the correct bias for the "LLM put unescaped quotes
|
||||||
|
// in a string value" failure mode.
|
||||||
func repairJSON(s string) string {
|
func repairJSON(s string) string {
|
||||||
|
runes := []rune(s)
|
||||||
var out strings.Builder
|
var out strings.Builder
|
||||||
out.Grow(len(s) + 64)
|
out.Grow(len(s) + 64)
|
||||||
|
|
||||||
inString := false
|
i := 0
|
||||||
runes := []rune(s)
|
for i < len(runes) {
|
||||||
|
|
||||||
for i := 0; i < len(runes); i++ {
|
|
||||||
c := runes[i]
|
c := runes[i]
|
||||||
|
|
||||||
if !inString {
|
if c != '"' {
|
||||||
out.WriteRune(c)
|
out.WriteRune(c)
|
||||||
if c == '"' {
|
i++
|
||||||
inString = true
|
|
||||||
}
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// We're inside a string
|
// We hit an opening quote. Determine if this is a key or a value.
|
||||||
if c == '\\' {
|
// Keys: the standard JSON parser in LLMs gets keys right, so we parse
|
||||||
// Escape sequence — pass through both characters
|
// them normally (first unescaped quote closes).
|
||||||
out.WriteRune(c)
|
// Values: may contain unescaped quotes — use the repair heuristic.
|
||||||
if i+1 < len(runes) {
|
isValue := isValuePosition(runes, i)
|
||||||
|
|
||||||
|
if !isValue {
|
||||||
|
// Parse key/simple string normally
|
||||||
|
out.WriteRune('"')
|
||||||
|
i++
|
||||||
|
for i < len(runes) {
|
||||||
|
ch := runes[i]
|
||||||
|
if ch == '\\' && i+1 < len(runes) {
|
||||||
|
out.WriteRune(ch)
|
||||||
i++
|
i++
|
||||||
out.WriteRune(runes[i])
|
out.WriteRune(runes[i])
|
||||||
|
i++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if ch == '"' {
|
||||||
|
out.WriteRune('"')
|
||||||
|
i++
|
||||||
|
break
|
||||||
|
}
|
||||||
|
out.WriteRune(ch)
|
||||||
|
i++
|
||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if c == '"' {
|
// Value string — find the correct close using last-valid-candidate heuristic
|
||||||
// Is this the end of the string, or an unescaped interior quote?
|
out.WriteRune('"')
|
||||||
// Look ahead: skip whitespace, then check for structural character
|
i++
|
||||||
j := i + 1
|
|
||||||
for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
|
closeIdx := findClosingQuote(runes, i)
|
||||||
|
|
||||||
|
// Write everything between open and close, escaping interior quotes
|
||||||
|
for j := i; j < closeIdx; j++ {
|
||||||
|
ch := runes[j]
|
||||||
|
if ch == '\\' && j+1 < closeIdx {
|
||||||
|
// Already-escaped sequence — pass through
|
||||||
|
out.WriteRune(ch)
|
||||||
j++
|
j++
|
||||||
}
|
out.WriteRune(runes[j])
|
||||||
if j < len(runes) {
|
} else if ch == '"' {
|
||||||
next := runes[j]
|
|
||||||
if next == ',' || next == '}' || next == ']' || next == ':' {
|
|
||||||
// Structural — this is really the end of the string
|
|
||||||
out.WriteRune(c)
|
|
||||||
inString = false
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// End of input — must be closing quote
|
|
||||||
out.WriteRune(c)
|
|
||||||
inString = false
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
// Not a structural close — this is an unescaped interior quote
|
|
||||||
out.WriteRune('\\')
|
out.WriteRune('\\')
|
||||||
out.WriteRune('"')
|
out.WriteRune('"')
|
||||||
continue
|
} else {
|
||||||
|
out.WriteRune(ch)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
out.WriteRune(c)
|
// Write the closing quote
|
||||||
|
out.WriteRune('"')
|
||||||
|
i = closeIdx + 1
|
||||||
}
|
}
|
||||||
|
|
||||||
return out.String()
|
return out.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isValuePosition determines if the quote at position i is opening a JSON value
|
||||||
|
// string (as opposed to an object key). We only apply repair to values that
|
||||||
|
// follow ':' since those are the free-text fields where LLMs produce unescaped
|
||||||
|
// quotes. Array elements and keys are left alone (parsed normally).
|
||||||
|
func isValuePosition(runes []rune, i int) bool {
|
||||||
|
// Look backward, skipping whitespace, for the preceding structural char
|
||||||
|
j := i - 1
|
||||||
|
for j >= 0 && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
|
||||||
|
j--
|
||||||
|
}
|
||||||
|
if j < 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
// After ':' → definitely a value
|
||||||
|
return runes[j] == ':'
|
||||||
|
}
|
||||||
|
|
||||||
|
// findClosingQuote finds the index of the true closing quote for a JSON string
|
||||||
|
// value starting at position start (the character after the opening quote).
|
||||||
|
// It collects all unescaped quote candidates and returns the FIRST one that
|
||||||
|
// produces valid JSON continuation (deeper lookahead verifies the next token).
|
||||||
|
func findClosingQuote(runes []rune, start int) int {
|
||||||
|
// Collect all candidate positions for the closing quote.
|
||||||
|
var candidates []int
|
||||||
|
for j := start; j < len(runes); j++ {
|
||||||
|
if runes[j] == '\\' {
|
||||||
|
j++ // skip escaped character
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if runes[j] == '"' {
|
||||||
|
candidates = append(candidates, j)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(candidates) == 0 {
|
||||||
|
return len(runes)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(candidates) == 1 {
|
||||||
|
return candidates[0]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try candidates from FIRST to LAST. The correct closing quote is the
|
||||||
|
// earliest one that produces valid JSON structure after it (verified by
|
||||||
|
// deeper lookahead that checks the next token is a valid JSON start).
|
||||||
|
for _, idx := range candidates {
|
||||||
|
if isValidJSONAfterClose(runes, idx+1) {
|
||||||
|
return idx
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: return the last candidate
|
||||||
|
return candidates[len(candidates)-1]
|
||||||
|
}
|
||||||
|
|
||||||
|
// isValidJSONAfterClose checks whether the runes after a candidate closing quote
|
||||||
|
// look like valid JSON continuation for a VALUE string. Since we only use this
|
||||||
|
// for value positions, ':' is NOT a valid continuation (values are never keys).
|
||||||
|
// Checks deeper structure to avoid being fooled by JSON-like content in strings.
|
||||||
|
func isValidJSONAfterClose(runes []rune, pos int) bool {
|
||||||
|
j := pos
|
||||||
|
for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
|
||||||
|
j++
|
||||||
|
}
|
||||||
|
|
||||||
|
if j >= len(runes) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
next := runes[j]
|
||||||
|
if next == '}' || next == ']' {
|
||||||
|
// Closing a container. Verify what follows the close is also valid:
|
||||||
|
// another structural char, comma, or EOF.
|
||||||
|
return isValidAfterContainerClose(runes, j+1)
|
||||||
|
}
|
||||||
|
if next == ',' {
|
||||||
|
// After comma, must be followed by a valid JSON token
|
||||||
|
j++
|
||||||
|
for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
|
||||||
|
j++
|
||||||
|
}
|
||||||
|
if j >= len(runes) {
|
||||||
|
return false // trailing comma with nothing after — invalid
|
||||||
|
}
|
||||||
|
return isJSONTokenStart(runes, j)
|
||||||
|
}
|
||||||
|
// ':' is NOT valid here — we're in a value position, not a key.
|
||||||
|
// Any other character is also invalid.
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// isValidAfterContainerClose checks that after a } or ], the continuation is
|
||||||
|
// structurally valid: more closes, comma+token, or EOF.
|
||||||
|
func isValidAfterContainerClose(runes []rune, pos int) bool {
|
||||||
|
j := pos
|
||||||
|
for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
|
||||||
|
j++
|
||||||
|
}
|
||||||
|
if j >= len(runes) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
next := runes[j]
|
||||||
|
if next == '}' || next == ']' {
|
||||||
|
return isValidAfterContainerClose(runes, j+1)
|
||||||
|
}
|
||||||
|
if next == ',' {
|
||||||
|
j++
|
||||||
|
for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
|
||||||
|
j++
|
||||||
|
}
|
||||||
|
if j >= len(runes) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return isJSONTokenStart(runes, j)
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// isJSONTokenStart returns true if the rune could begin a JSON value or key.
|
||||||
|
// For keywords (true/false/null), verifies the full keyword is present.
|
||||||
|
func isJSONTokenStart(runes []rune, pos int) bool {
|
||||||
|
if pos >= len(runes) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
r := runes[pos]
|
||||||
|
switch {
|
||||||
|
case r == '"': // string
|
||||||
|
return true
|
||||||
|
case r == '{' || r == '[': // object or array
|
||||||
|
return true
|
||||||
|
case r == 't': // true
|
||||||
|
return pos+4 <= len(runes) && string(runes[pos:pos+4]) == "true"
|
||||||
|
case r == 'f': // false
|
||||||
|
return pos+5 <= len(runes) && string(runes[pos:pos+5]) == "false"
|
||||||
|
case r == 'n': // null
|
||||||
|
return pos+4 <= len(runes) && string(runes[pos:pos+4]) == "null"
|
||||||
|
case r >= '0' && r <= '9': // number
|
||||||
|
return true
|
||||||
|
case r == '-': // negative number
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|||||||
@@ -149,3 +149,76 @@ func TestRepairJSON_FixesUnescapedQuotes(t *testing.T) {
|
|||||||
t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result)
|
t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRepairJSON_InteriorQuoteBeforeComma(t *testing.T) {
|
||||||
|
// Bug reported by reviewer: interior quoted word immediately before a comma
|
||||||
|
input := `{"msg": "say "yes", and go"}`
|
||||||
|
result := repairJSON(input)
|
||||||
|
|
||||||
|
var m map[string]interface{}
|
||||||
|
if err := json.Unmarshal([]byte(result), &m); err != nil {
|
||||||
|
t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result)
|
||||||
|
}
|
||||||
|
// The full string content should be preserved
|
||||||
|
msg, ok := m["msg"].(string)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("msg field missing or not a string")
|
||||||
|
}
|
||||||
|
if msg != `say "yes", and go` {
|
||||||
|
t.Errorf("unexpected msg content: %q", msg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRepairJSON_InteriorQuoteBeforeCloseBrace(t *testing.T) {
|
||||||
|
// Bug reported by reviewer: JSON-shaped syntax inside string values
|
||||||
|
input := `{"msg": "input map {"key": "val"} caused error"}`
|
||||||
|
result := repairJSON(input)
|
||||||
|
|
||||||
|
var m map[string]interface{}
|
||||||
|
if err := json.Unmarshal([]byte(result), &m); err != nil {
|
||||||
|
t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRepairJSON_MultipleFields(t *testing.T) {
|
||||||
|
// Multiple string fields with unescaped quotes in different positions
|
||||||
|
input := `{"a": "hello "world"", "b": "foo"}`
|
||||||
|
result := repairJSON(input)
|
||||||
|
|
||||||
|
var m map[string]interface{}
|
||||||
|
if err := json.Unmarshal([]byte(result), &m); err != nil {
|
||||||
|
t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result)
|
||||||
|
}
|
||||||
|
if _, ok := m["b"]; !ok {
|
||||||
|
t.Error("expected 'b' field to be preserved")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRepairJSON_PreservesEscapedQuotes(t *testing.T) {
|
||||||
|
// Already-escaped quotes should not be double-escaped
|
||||||
|
input := `{"msg": "already \"escaped\" here"}`
|
||||||
|
result := repairJSON(input)
|
||||||
|
|
||||||
|
if result != input {
|
||||||
|
t.Errorf("repairJSON should not modify already-escaped quotes\n got: %s\n want: %s", result, input)
|
||||||
|
}
|
||||||
|
|
||||||
|
var m map[string]interface{}
|
||||||
|
if err := json.Unmarshal([]byte(result), &m); err != nil {
|
||||||
|
t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRepairJSON_ComplexNestedContent(t *testing.T) {
|
||||||
|
// Combines both reviewer bugs: quoted words before commas AND JSON-like content
|
||||||
|
input := `{"verdict": "APPROVE", "findings": [{"finding": "The map {"key": "val"} and (e.g. "28") and say "yes", then stop"}]}`
|
||||||
|
result := repairJSON(input)
|
||||||
|
|
||||||
|
var parsed map[string]interface{}
|
||||||
|
if err := json.Unmarshal([]byte(result), &parsed); err != nil {
|
||||||
|
t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result)
|
||||||
|
}
|
||||||
|
if parsed["verdict"] != "APPROVE" {
|
||||||
|
t.Errorf("expected verdict APPROVE, got %v", parsed["verdict"])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user