Files
review-bot/review/parser.go
claw db479d0ff4
CI / test (pull_request) Successful in 15s
CI / review (/openai/v1, gpt-4.1, gpt41, openai, GPT_REVIEW_TOKEN) (pull_request) Successful in 25s
CI / review (/openai/v1, gpt-4.1-mini, gpt41-mini, openai, GPT_REVIEW_TOKEN) (pull_request) Successful in 29s
CI / review (/anthropic/v1, claude-sonnet-4-6, sonnet, anthropic, SONNET_REVIEW_TOKEN) (pull_request) Successful in 49s
CI / review (/openai/v1, gpt-5, security, openai, SECURITY_REVIEW.md, SECURITY_REVIEW_TOKEN) (pull_request) Successful in 50s
CI / review (/openai/v1, gpt-5, gpt, openai, GPT_REVIEW_TOKEN) (pull_request) Successful in 1m15s
CI / review (/openai/v1, gpt-5-mini, gpt5-mini, openai, GPT_REVIEW_TOKEN) (pull_request) Successful in 52s
fix: retry on transient LLM response body truncation
Addresses intermittent 'unexpected end of JSON input' failures where the
LLM response body is truncated in transit between the proxy and client.

Root cause: network-level truncation where io.ReadAll returns partial data
(observed in 3/50 CI runs through HAI proxy). The response body reading
was already using io.ReadAll correctly, but transient network issues
between the proxy and client can still cause partial reads.

Changes:
- Add Content-Length validation in doRequest: detect when fewer bytes
  arrive than the server declared, triggering a retry
- Add retry logic in Complete: retries once on retryable errors (body
  read failures, content-length mismatches) with a 500ms backoff
- Add parse-level retry in main: if ParseResponse fails, re-requests
  from the LLM once before giving up (defensive, since retries always
  succeed per issue evidence)
- Improve ParseResponse error diagnostics: log raw vs cleaned lengths
  and a preview of the cleaned content to aid future debugging

Does NOT retry on API errors (4xx/5xx) or structural issues — only
transient body read problems.

Closes #47
2026-05-07 00:44:32 -07:00

316 lines
8.7 KiB
Go

package review
import (
"encoding/json"
"fmt"
"strings"
)
// Finding represents a single code review finding.
type Finding struct {
Severity string `json:"severity"`
File string `json:"file"`
Line int `json:"line"`
Finding string `json:"finding"`
}
// ReviewResult is the structured output from the LLM.
type ReviewResult struct {
Verdict string `json:"verdict"`
Summary string `json:"summary"`
Findings []Finding `json:"findings"`
Recommendation string `json:"recommendation"`
}
// ParseResponse parses the LLM response into a ReviewResult.
func ParseResponse(response string) (*ReviewResult, error) {
// Try to extract JSON from the response — the LLM might wrap it in markdown fences
cleaned := extractJSON(response)
var result ReviewResult
if err := json.Unmarshal([]byte(cleaned), &result); err != nil {
// LLMs sometimes produce JSON with unescaped quotes inside string values.
// Try to repair before giving up.
repaired := repairJSON(cleaned)
if err2 := json.Unmarshal([]byte(repaired), &result); err2 != nil {
// Include diagnostic info: lengths help identify truncation
rawLen := len(response)
cleanedLen := len(cleaned)
preview := cleaned
if len(preview) > 200 {
preview = preview[:100] + "..." + preview[len(preview)-100:]
}
return nil, fmt.Errorf("parse LLM response as JSON: %w\nRaw length: %d, cleaned length: %d\nCleaned preview: %s", err, rawLen, cleanedLen, preview)
}
}
// Validate verdict
switch result.Verdict {
case "APPROVE", "REQUEST_CHANGES":
// valid
default:
return nil, fmt.Errorf("invalid verdict %q (must be APPROVE or REQUEST_CHANGES)", result.Verdict)
}
// Validate finding severities
for i, f := range result.Findings {
switch f.Severity {
case "MAJOR", "MINOR", "NIT":
// valid
default:
return nil, fmt.Errorf("finding %d has invalid severity %q", i, f.Severity)
}
}
return &result, nil
}
// extractJSON attempts to pull JSON from a potentially markdown-wrapped response.
func extractJSON(s string) string {
s = strings.TrimSpace(s)
// Remove markdown code fences if present
if strings.HasPrefix(s, "```") {
lines := strings.Split(s, "\n")
// Remove first line (```json or ```)
if len(lines) > 2 {
lines = lines[1:]
}
// Remove last line (```)
if len(lines) > 0 && strings.TrimSpace(lines[len(lines)-1]) == "```" {
lines = lines[:len(lines)-1]
}
s = strings.Join(lines, "\n")
}
s = strings.TrimSpace(s)
return s
}
// repairJSON attempts to fix common LLM JSON issues:
// - Unescaped double quotes inside string values
//
// Strategy: walk the JSON structurally. Object keys are parsed normally (LLMs
// get those right). For string VALUES, we find all candidate closing quotes and
// pick the LAST one that leaves valid JSON structure afterward — maximizing
// string content, which is the correct bias for the "LLM put unescaped quotes
// in a string value" failure mode.
func repairJSON(s string) string {
runes := []rune(s)
var out strings.Builder
out.Grow(len(s) + 64)
i := 0
for i < len(runes) {
c := runes[i]
if c != '"' {
out.WriteRune(c)
i++
continue
}
// We hit an opening quote. Determine if this is a key or a value.
// Keys: the standard JSON parser in LLMs gets keys right, so we parse
// them normally (first unescaped quote closes).
// Values: may contain unescaped quotes — use the repair heuristic.
isValue := isValuePosition(runes, i)
if !isValue {
// Parse key/simple string normally
out.WriteRune('"')
i++
for i < len(runes) {
ch := runes[i]
if ch == '\\' && i+1 < len(runes) {
out.WriteRune(ch)
i++
out.WriteRune(runes[i])
i++
continue
}
if ch == '"' {
out.WriteRune('"')
i++
break
}
out.WriteRune(ch)
i++
}
continue
}
// Value string — find the correct close using last-valid-candidate heuristic
out.WriteRune('"')
i++
closeIdx := findClosingQuote(runes, i)
// Write everything between open and close, escaping interior quotes
for j := i; j < closeIdx; j++ {
ch := runes[j]
if ch == '\\' && j+1 < closeIdx {
// Already-escaped sequence — pass through
out.WriteRune(ch)
j++
out.WriteRune(runes[j])
} else if ch == '"' {
out.WriteRune('\\')
out.WriteRune('"')
} else {
out.WriteRune(ch)
}
}
// Write the closing quote
out.WriteRune('"')
i = closeIdx + 1
}
return out.String()
}
// isValuePosition determines if the quote at position i is opening a JSON value
// string (as opposed to an object key). We only apply repair to values that
// follow ':' since those are the free-text fields where LLMs produce unescaped
// quotes. Array elements and keys are left alone (parsed normally).
func isValuePosition(runes []rune, i int) bool {
// Look backward, skipping whitespace, for the preceding structural char
j := i - 1
for j >= 0 && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
j--
}
if j < 0 {
return false
}
// After ':' → definitely a value
return runes[j] == ':'
}
// findClosingQuote finds the index of the true closing quote for a JSON string
// value starting at position start (the character after the opening quote).
// It collects all unescaped quote candidates and returns the FIRST one that
// produces valid JSON continuation (deeper lookahead verifies the next token).
func findClosingQuote(runes []rune, start int) int {
// Collect all candidate positions for the closing quote.
var candidates []int
for j := start; j < len(runes); j++ {
if runes[j] == '\\' {
j++ // skip escaped character
continue
}
if runes[j] == '"' {
candidates = append(candidates, j)
}
}
if len(candidates) == 0 {
return len(runes)
}
if len(candidates) == 1 {
return candidates[0]
}
// Try candidates from FIRST to LAST. The correct closing quote is the
// earliest one that produces valid JSON structure after it (verified by
// deeper lookahead that checks the next token is a valid JSON start).
for _, idx := range candidates {
if isValidJSONAfterClose(runes, idx+1) {
return idx
}
}
// Fallback: return the last candidate
return candidates[len(candidates)-1]
}
// isValidJSONAfterClose checks whether the runes after a candidate closing quote
// look like valid JSON continuation for a VALUE string. Since we only use this
// for value positions, ':' is NOT a valid continuation (values are never keys).
// Checks deeper structure to avoid being fooled by JSON-like content in strings.
func isValidJSONAfterClose(runes []rune, pos int) bool {
j := pos
for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
j++
}
if j >= len(runes) {
return true
}
next := runes[j]
if next == '}' || next == ']' {
// Closing a container. Verify what follows the close is also valid:
// another structural char, comma, or EOF.
return isValidAfterContainerClose(runes, j+1)
}
if next == ',' {
// After comma, must be followed by a valid JSON token
j++
for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
j++
}
if j >= len(runes) {
return false // trailing comma with nothing after — invalid
}
return isJSONTokenStart(runes, j)
}
// ':' is NOT valid here — we're in a value position, not a key.
// Any other character is also invalid.
return false
}
// isValidAfterContainerClose checks that after a } or ], the continuation is
// structurally valid: more closes, comma+token, or EOF.
func isValidAfterContainerClose(runes []rune, pos int) bool {
j := pos
for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
j++
}
if j >= len(runes) {
return true
}
next := runes[j]
if next == '}' || next == ']' {
return isValidAfterContainerClose(runes, j+1)
}
if next == ',' {
j++
for j < len(runes) && (runes[j] == ' ' || runes[j] == '\t' || runes[j] == '\n' || runes[j] == '\r') {
j++
}
if j >= len(runes) {
return false
}
return isJSONTokenStart(runes, j)
}
return false
}
// isJSONTokenStart returns true if the rune could begin a JSON value or key.
// For keywords (true/false/null), verifies the full keyword is present.
func isJSONTokenStart(runes []rune, pos int) bool {
if pos >= len(runes) {
return false
}
r := runes[pos]
switch {
case r == '"': // string
return true
case r == '{' || r == '[': // object or array
return true
case r == 't': // true
return pos+4 <= len(runes) && string(runes[pos:pos+4]) == "true"
case r == 'f': // false
return pos+5 <= len(runes) && string(runes[pos:pos+5]) == "false"
case r == 'n': // null
return pos+4 <= len(runes) && string(runes[pos:pos+4]) == "null"
case r >= '0' && r <= '9': // number
return true
case r == '-': // negative number
return true
}
return false
}