// review-bot/review/parser.go

package review

import (
	"encoding/json"
	"fmt"
	"strings"
)

// Finding represents a single code review finding.
//
// It is decoded from one element of the "findings" array of the LLM's JSON
// output. Fields, with their JSON keys:
//   - Severity ("severity"): ParseResponse accepts only "MAJOR", "MINOR" or
//     "NIT" and rejects the whole response otherwise.
//   - File ("file"): presumably the path of the file the finding refers to
//     (not validated here; confirm against the prompt schema).
//   - Line ("line"): presumably the line number within File (not validated
//     here; whether it is 0- or 1-based is not visible in this file).
//   - Finding ("finding"): the text of the finding itself.
type Finding struct {
	Severity string `json:"severity"`
	File     string `json:"file"`
	Line     int    `json:"line"`
	Finding  string `json:"finding"`
}

// ReviewResult is the structured output from the LLM.
//
// Fields, with their JSON keys:
//   - Verdict ("verdict"): ParseResponse accepts only "APPROVE" or
//     "REQUEST_CHANGES".
//   - Summary ("summary"): free text; not validated by ParseResponse.
//   - Findings ("findings"): the individual findings; each element's Severity
//     is validated by ParseResponse. May be empty or nil.
//   - Recommendation ("recommendation"): free text; not validated by
//     ParseResponse.
type ReviewResult struct {
	Verdict        string    `json:"verdict"`
	Summary        string    `json:"summary"`
	Findings       []Finding `json:"findings"`
	Recommendation string    `json:"recommendation"`
}

// ParseResponse parses the LLM response into a ReviewResult.
//
// The response is first stripped of surrounding whitespace / markdown fences
// (extractJSON) and decoded as JSON. If decoding fails, a repaired copy of the
// text (repairJSON) is decoded instead. When both attempts fail the returned
// error wraps the error from the FIRST attempt (the one describing the text
// the LLM actually produced) and carries the raw/cleaned lengths plus a
// shortened preview of the cleaned text to help spot truncation.
//
// After decoding, the verdict must be "APPROVE" or "REQUEST_CHANGES" and every
// finding's severity must be "MAJOR", "MINOR" or "NIT"; otherwise an error is
// returned. On any error the returned *ReviewResult is nil.
func ParseResponse(response string) (*ReviewResult, error) {
	// Try to extract JSON from the response — the LLM might wrap it in markdown fences
	cleaned := extractJSON(response)

	var result ReviewResult
	if err := json.Unmarshal([]byte(cleaned), &result); err != nil {
		// LLMs sometimes produce JSON with unescaped quotes inside string values.
		// Try to repair before giving up. Decode into a fresh value so nothing
		// left behind by the failed first attempt can leak into the result.
		var repaired ReviewResult
		if repairErr := json.Unmarshal([]byte(repairJSON(cleaned)), &repaired); repairErr != nil {
			// Keep at most the first and last 100 bytes of the cleaned text.
			// The cut points are moved off UTF-8 continuation bytes
			// (0b10xxxxxx) so a multi-byte character is never split and the
			// error message stays valid UTF-8.
			preview := cleaned
			if len(preview) > 200 {
				head, tail := 100, len(preview)-100
				for head > 0 && preview[head]&0xC0 == 0x80 {
					head--
				}
				for tail < len(preview) && preview[tail]&0xC0 == 0x80 {
					tail++
				}
				preview = preview[:head] + "..." + preview[tail:]
			}
			// Include diagnostic info: lengths help identify truncation
			return nil, fmt.Errorf("parse LLM response as JSON: %w\nRaw length: %d, cleaned length: %d\nCleaned preview: %s", err, len(response), len(cleaned), preview)
		}
		result = repaired
	}

	// Validate verdict
	switch result.Verdict {
	case "APPROVE", "REQUEST_CHANGES":
		// valid
	default:
		return nil, fmt.Errorf("invalid verdict %q (must be APPROVE or REQUEST_CHANGES)", result.Verdict)
	}

	// Validate finding severities
	for i, f := range result.Findings {
		switch f.Severity {
		case "MAJOR", "MINOR", "NIT":
			// valid
		default:
			return nil, fmt.Errorf("finding %d has invalid severity %q", i, f.Severity)
		}
	}

	return &result, nil
}

// extractJSON attempts to pull JSON from a potentially markdown-wrapped response.
//
// The input is trimmed of surrounding whitespace. If it then starts with a
// markdown code fence (```), the opening fence line — including any language
// tag such as "json" — is dropped whenever at least one more line follows it,
// and the last line is dropped when it consists solely of ```. A missing
// closing fence (e.g. a truncated response) is tolerated. Input that does not
// start with a fence is returned trimmed but otherwise unchanged.
func extractJSON(s string) string {
	s = strings.TrimSpace(s)
	if !strings.HasPrefix(s, "```") {
		return s
	}

	lines := strings.Split(s, "\n")
	// Remove first line (```json or ```). Requiring only ONE following line
	// (rather than two) also strips the fence from a fenced payload that has
	// no closing fence line.
	if len(lines) > 1 {
		lines = lines[1:]
	}
	// Remove last line (```)
	if n := len(lines); n > 0 && strings.TrimSpace(lines[n-1]) == "```" {
		lines = lines[:n-1]
	}

	return strings.TrimSpace(strings.Join(lines, "\n"))
}

// repairJSON attempts to fix common LLM JSON issues inside string values:
// - Unescaped double quotes
// - Raw control characters (newline, tab, ...), which JSON requires to be
//   escaped inside strings
//
// Strategy: walk the text rune by rune. Everything outside strings is copied
// unchanged. A string that does not directly follow ':' (an object key or an
// array element) is copied verbatim, ending at its first unescaped quote —
// LLMs reliably get those right. A string that follows ':' is treated as a
// free-text value: findClosingQuote picks its closing quote (the FIRST
// candidate quote after which the text looks like valid JSON continuation),
// and every quote before that point is escaped.
//
// If a value string has no closing quote at all, the rest of the input is
// taken as its content and a closing quote is appended.
//
// The function never fails; the caller decides whether the result is usable
// by trying to decode it.
func repairJSON(s string) string {
	const hexDigits = "0123456789abcdef"

	runes := []rune(s)
	var out strings.Builder
	out.Grow(len(s) + 64)

	i := 0
	for i < len(runes) {
		c := runes[i]

		if c != '"' {
			out.WriteRune(c)
			i++
			continue
		}

		// We hit an opening quote. Determine if this is a key or a value.
		if !isValuePosition(runes, i) {
			// Key / array element: copy normally (first unescaped quote closes).
			out.WriteRune('"')
			i++
			for i < len(runes) {
				ch := runes[i]
				if ch == '\\' && i+1 < len(runes) {
					// Escape sequence — copy both runes so an escaped quote
					// is not mistaken for the closing one.
					out.WriteRune(ch)
					out.WriteRune(runes[i+1])
					i += 2
					continue
				}
				out.WriteRune(ch)
				i++
				if ch == '"' {
					break
				}
			}
			continue
		}

		// Value string — locate the real closing quote heuristically.
		out.WriteRune('"')
		i++

		closeIdx := findClosingQuote(runes, i)

		// Write everything between open and close, escaping what JSON does
		// not allow to appear raw inside a string.
		for j := i; j < closeIdx; j++ {
			ch := runes[j]
			switch {
			case ch == '\\' && j+1 < closeIdx:
				// Already-escaped sequence — pass through
				out.WriteRune(ch)
				j++
				out.WriteRune(runes[j])
			case ch == '"':
				out.WriteString(`\"`)
			case ch == '\n':
				out.WriteString(`\n`)
			case ch == '\r':
				out.WriteString(`\r`)
			case ch == '\t':
				out.WriteString(`\t`)
			case ch < 0x20:
				// Any other control character: \u00XX
				out.WriteString(`\u00`)
				out.WriteByte(hexDigits[ch>>4])
				out.WriteByte(hexDigits[ch&0x0f])
			default:
				out.WriteRune(ch)
			}
		}

		// Write the closing quote
		out.WriteRune('"')
		i = closeIdx + 1
	}

	return out.String()
}

// isValuePosition reports whether the quote at index i opens an object VALUE
// string, i.e. whether the nearest preceding non-whitespace rune is ':'.
// Quotes that open object keys or array elements (preceded by '{', ',', '[',
// or by nothing at all) yield false, so only the free-text fields that follow
// a colon are subjected to quote repair.
func isValuePosition(runes []rune, i int) bool {
	// Walk backwards from the rune just before the quote.
	for k := i - 1; k >= 0; k-- {
		switch runes[k] {
		case ' ', '\t', '\n', '\r':
			// JSON whitespace: keep looking further back.
			continue
		case ':':
			return true
		default:
			return false
		}
	}
	// Reached the start of the input without meeting any structural rune.
	return false
}

// findClosingQuote finds the index of the true closing quote for a JSON string
// value starting at position start (the character after the opening quote).
//
// Every unescaped quote from start onward is a candidate (a rune that follows
// a backslash is skipped and never considered). The result is:
//   - the FIRST candidate after which the text looks like valid JSON
//     continuation (see isValidJSONAfterClose);
//   - otherwise the LAST candidate seen;
//   - len(runes) when there is no candidate at all (unterminated string).
//
// Candidates are checked as they are encountered, so the scan stops at the
// first acceptable quote instead of first collecting every quote up to the
// end of the document.
func findClosingQuote(runes []rune, start int) int {
	last := -1 // most recent candidate; used as fallback when none validates
	for j := start; j < len(runes); j++ {
		switch runes[j] {
		case '\\':
			j++ // skip escaped character
		case '"':
			if isValidJSONAfterClose(runes, j+1) {
				return j
			}
			last = j
		}
	}

	if last < 0 {
		return len(runes)
	}
	return last
}

// isValidJSONAfterClose checks whether the runes after a candidate closing quote
// look like valid JSON continuation for a VALUE string.
//
// pos is the index just past the candidate quote. Because the string being
// closed is the value of an object member, only these continuations (after
// optional whitespace) are accepted:
//   - end of input;
//   - '}' or ']' followed by a valid continuation after a container close
//     (delegated to isValidAfterContainerClose);
//   - ',' followed by the next member, i.e. a quoted key and then ':'.
//
// ':' directly after the quote is NOT valid (values are never keys), and
// neither is any other rune. Requiring a full `"key":` after the comma —
// rather than any JSON token — keeps prose such as `"foo", "bar"` or
// `"x", true` inside a value from being mistaken for the end of that value.
func isValidJSONAfterClose(runes []rune, pos int) bool {
	// skipSpace returns the index of the first non-whitespace rune at or
	// after k (or len(runes) if there is none).
	skipSpace := func(k int) int {
		for k < len(runes) && (runes[k] == ' ' || runes[k] == '\t' || runes[k] == '\n' || runes[k] == '\r') {
			k++
		}
		return k
	}

	j := skipSpace(pos)
	if j >= len(runes) {
		return true
	}

	switch runes[j] {
	case '}', ']':
		// Closing a container. Verify what follows the close is also valid.
		return isValidAfterContainerClose(runes, j+1)
	case ',':
		// Next object member: must start with a quoted key...
		k := skipSpace(j + 1)
		if k >= len(runes) || runes[k] != '"' {
			return false // trailing comma, or something that is not a key
		}
		// ...whose first unescaped quote is followed by ':'.
		for k++; k < len(runes); k++ {
			switch runes[k] {
			case '\\':
				k++ // skip escaped character inside the key
			case '"':
				c := skipSpace(k + 1)
				return c < len(runes) && runes[c] == ':'
			}
		}
		return false // key never terminated
	}

	// ':' is NOT valid here — we're in a value position, not a key.
	// Any other character is also invalid.
	return false
}

// isValidAfterContainerClose reports whether the text starting at pos — the
// index just past a '}' or ']' — is a structurally plausible continuation.
// Ignoring whitespace, it accepts any run of further '}' / ']' closers that
// ends either at end of input or at a ',' which is itself followed by the
// start of a JSON token. Anything else is rejected.
func isValidAfterContainerClose(runes []rune, pos int) bool {
	isSpace := func(r rune) bool {
		return r == ' ' || r == '\t' || r == '\n' || r == '\r'
	}
	n := len(runes)

	// Each pass consumes optional whitespace plus one closer; the loop's
	// post statement steps over the closer itself.
	for idx := pos; ; idx++ {
		for idx < n && isSpace(runes[idx]) {
			idx++
		}
		if idx >= n {
			return true
		}

		switch runes[idx] {
		case '}', ']':
			continue
		case ',':
			next := idx + 1
			for next < n && isSpace(runes[next]) {
				next++
			}
			// A comma with nothing after it is not valid.
			return next < n && isJSONTokenStart(runes, next)
		default:
			return false
		}
	}
}

// isJSONTokenStart reports whether a JSON value or key could begin at
// runes[pos]: a quote, '{', '[', '-', an ASCII digit, or one of the literals
// true / false / null spelled out in full starting at pos. A pos at or beyond
// the end of runes yields false.
func isJSONTokenStart(runes []rune, pos int) bool {
	if pos >= len(runes) {
		return false
	}

	// hasWord checks that the complete literal is present at pos.
	hasWord := func(word string) bool {
		end := pos + len(word)
		return end <= len(runes) && string(runes[pos:end]) == word
	}

	switch r := runes[pos]; r {
	case '"', '{', '[', '-':
		// string, object, array, negative number
		return true
	case 't':
		return hasWord("true")
	case 'f':
		return hasWord("false")
	case 'n':
		return hasWord("null")
	default:
		// number
		return '0' <= r && r <= '9'
	}
}