diff --git a/review/parser.go b/review/parser.go index a615e86..57c0b24 100644 --- a/review/parser.go +++ b/review/parser.go @@ -83,68 +83,226 @@ func extractJSON(s string) string { // repairJSON attempts to fix common LLM JSON issues: // - Unescaped double quotes inside string values // -// Strategy: walk the JSON character by character, tracking whether we're inside -// a string value. When we find a quote that doesn't look like a structural -// delimiter (not preceded by \, not followed by : , } ] or whitespace+structural), -// escape it. +// Strategy: walk the JSON structurally. Object keys are parsed normally (LLMs +// get those right). For string VALUES, we find all candidate closing quotes and +// pick the LAST one that leaves valid JSON structure afterward — maximizing +// string content, which is the correct bias for the "LLM put unescaped quotes +// in a string value" failure mode. func repairJSON(s string) string { + runes := []rune(s) var out strings.Builder out.Grow(len(s) + 64) - inString := false - runes := []rune(s) - - for i := 0; i < len(runes); i++ { + i := 0 + for i < len(runes) { c := runes[i] - if !inString { + if c != '"' { out.WriteRune(c) - if c == '"' { - inString = true - } + i++ continue } - // We're inside a string - if c == '\\' { - // Escape sequence — pass through both characters - out.WriteRune(c) - if i+1 < len(runes) { - i++ - out.WriteRune(runes[i]) - } - continue - } + // We hit an opening quote. Determine if this is a key or a value. + // Keys: the standard JSON parser in LLMs gets keys right, so we parse + // them normally (first unescaped quote closes). + // Values: may contain unescaped quotes — use the repair heuristic. + isValue := isValuePosition(runes, i) - if c == '"' { - // Is this the end of the string, or an unescaped interior quote? - // Look ahead: skip whitespace, then check for structural character - j := i + 1 - for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { - j++ - } - if j < len(runes) { - next := runes[j] - if next == ',' || next == '}' || next == ']' || next == ':' { - // Structural — this is really the end of the string - out.WriteRune(c) - inString = false + if !isValue { + // Parse key/simple string normally + out.WriteRune('"') + i++ + for i < len(runes) { + ch := runes[i] + if ch == '\\' && i+1 < len(runes) { + out.WriteRune(ch) + i++ + out.WriteRune(runes[i]) + i++ continue } - } else { - // End of input — must be closing quote - out.WriteRune(c) - inString = false - continue + if ch == '"' { + out.WriteRune('"') + i++ + break + } + out.WriteRune(ch) + i++ } - // Not a structural close — this is an unescaped interior quote - out.WriteRune('\\') - out.WriteRune('"') continue } - out.WriteRune(c) + // Value string — find the correct close using last-valid-candidate heuristic + out.WriteRune('"') + i++ + + closeIdx := findClosingQuote(runes, i) + + // Write everything between open and close, escaping interior quotes + for j := i; j < closeIdx; j++ { + ch := runes[j] + if ch == '\\' && j+1 < closeIdx { + // Already-escaped sequence — pass through + out.WriteRune(ch) + j++ + out.WriteRune(runes[j]) + } else if ch == '"' { + out.WriteRune('\\') + out.WriteRune('"') + } else { + out.WriteRune(ch) + } + } + + // Write the closing quote + out.WriteRune('"') + i = closeIdx + 1 } return out.String() } + +// isValuePosition determines if the quote at position i is opening a JSON value +// string (as opposed to an object key). We only apply repair to values that +// follow ':' since those are the free-text fields where LLMs produce unescaped +// quotes. Array elements and keys are left alone (parsed normally). +func isValuePosition(runes []rune, i int) bool { + // Look backward, skipping whitespace, for the preceding structural char + j := i - 1 + for j >= 0 && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j-- + } + if j < 0 { + return false + } + // After ':' → definitely a value + return runes[j] == ':' +} + +// findClosingQuote finds the index of the true closing quote for a JSON string +// value starting at position start (the character after the opening quote). +// It collects all unescaped quote candidates and returns the FIRST one that +// produces valid JSON continuation (deeper lookahead verifies the next token). +func findClosingQuote(runes []rune, start int) int { + // Collect all candidate positions for the closing quote. + var candidates []int + for j := start; j < len(runes); j++ { + if runes[j] == '\\' { + j++ // skip escaped character + continue + } + if runes[j] == '"' { + candidates = append(candidates, j) + } + } + + if len(candidates) == 0 { + return len(runes) + } + + if len(candidates) == 1 { + return candidates[0] + } + + // Try candidates from FIRST to LAST. The correct closing quote is the + // earliest one that produces valid JSON structure after it (verified by + // deeper lookahead that checks the next token is a valid JSON start). + for _, idx := range candidates { + if isValidJSONAfterClose(runes, idx+1) { + return idx + } + } + + // Fallback: return the last candidate + return candidates[len(candidates)-1] +} + +// isValidJSONAfterClose checks whether the runes after a candidate closing quote +// look like valid JSON continuation for a VALUE string. Since we only use this +// for value positions, ':' is NOT a valid continuation (values are never keys). +// Checks deeper structure to avoid being fooled by JSON-like content in strings. +func isValidJSONAfterClose(runes []rune, pos int) bool { + j := pos + for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j++ + } + + if j >= len(runes) { + return true + } + + next := runes[j] + if next == '}' || next == ']' { + // Closing a container. Verify what follows the close is also valid: + // another structural char, comma, or EOF. + return isValidAfterContainerClose(runes, j+1) + } + if next == ',' { + // After comma, must be followed by a valid JSON token + j++ + for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j++ + } + if j >= len(runes) { + return false // trailing comma with nothing after — invalid + } + return isJSONTokenStart(runes, j) + } + // ':' is NOT valid here — we're in a value position, not a key. + // Any other character is also invalid. + return false +} + +// isValidAfterContainerClose checks that after a } or ], the continuation is +// structurally valid: more closes, comma+token, or EOF. +func isValidAfterContainerClose(runes []rune, pos int) bool { + j := pos + for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j++ + } + if j >= len(runes) { + return true + } + next := runes[j] + if next == '}' || next == ']' { + return isValidAfterContainerClose(runes, j+1) + } + if next == ',' { + j++ + for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') { + j++ + } + if j >= len(runes) { + return false + } + return isJSONTokenStart(runes, j) + } + return false +} + +// isJSONTokenStart returns true if the rune could begin a JSON value or key. +// For keywords (true/false/null), verifies the full keyword is present. +func isJSONTokenStart(runes []rune, pos int) bool { + if pos >= len(runes) { + return false + } + r := runes[pos] + switch { + case r == '"': // string + return true + case r == '{' || r == '[': // object or array + return true + case r == 't': // true + return pos+4 <= len(runes) && string(runes[pos:pos+4]) == "true" + case r == 'f': // false + return pos+5 <= len(runes) && string(runes[pos:pos+5]) == "false" + case r == 'n': // null + return pos+4 <= len(runes) && string(runes[pos:pos+4]) == "null" + case r >= '0' && r <= '9': // number + return true + case r == '-': // negative number + return true + } + return false +} diff --git a/review/parser_test.go b/review/parser_test.go index ae8c071..56c6508 100644 --- a/review/parser_test.go +++ b/review/parser_test.go @@ -149,3 +149,76 @@ func TestRepairJSON_FixesUnescapedQuotes(t *testing.T) { t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) } } + +func TestRepairJSON_InteriorQuoteBeforeComma(t *testing.T) { + // Bug reported by reviewer: interior quoted word immediately before a comma + input := `{"msg": "say "yes", and go"}` + result := repairJSON(input) + + var m map[string]interface{} + if err := json.Unmarshal([]byte(result), &m); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } + // The full string content should be preserved + msg, ok := m["msg"].(string) + if !ok { + t.Fatal("msg field missing or not a string") + } + if msg != `say "yes", and go` { + t.Errorf("unexpected msg content: %q", msg) + } +} + +func TestRepairJSON_InteriorQuoteBeforeCloseBrace(t *testing.T) { + // Bug reported by reviewer: JSON-shaped syntax inside string values + input := `{"msg": "input map {"key": "val"} caused error"}` + result := repairJSON(input) + + var m map[string]interface{} + if err := json.Unmarshal([]byte(result), &m); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } +} + +func TestRepairJSON_MultipleFields(t *testing.T) { + // Multiple string fields with unescaped quotes in different positions + input := `{"a": "hello "world"", "b": "foo"}` + result := repairJSON(input) + + var m map[string]interface{} + if err := json.Unmarshal([]byte(result), &m); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } + if _, ok := m["b"]; !ok { + t.Error("expected 'b' field to be preserved") + } +} + +func TestRepairJSON_PreservesEscapedQuotes(t *testing.T) { + // Already-escaped quotes should not be double-escaped + input := `{"msg": "already \"escaped\" here"}` + result := repairJSON(input) + + if result != input { + t.Errorf("repairJSON should not modify already-escaped quotes\n got: %s\n want: %s", result, input) + } + + var m map[string]interface{} + if err := json.Unmarshal([]byte(result), &m); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } +} + +func TestRepairJSON_ComplexNestedContent(t *testing.T) { + // Combines both reviewer bugs: quoted words before commas AND JSON-like content + input := `{"verdict": "APPROVE", "findings": [{"finding": "The map {"key": "val"} and (e.g. "28") and say "yes", then stop"}]}` + result := repairJSON(input) + + var parsed map[string]interface{} + if err := json.Unmarshal([]byte(result), &parsed); err != nil { + t.Fatalf("repaired JSON should parse, got: %v\nrepaired: %s", err, result) + } + if parsed["verdict"] != "APPROVE" { + t.Errorf("expected verdict APPROVE, got %v", parsed["verdict"]) + } +}