fix: retry on transient LLM response body truncation

Addresses intermittent 'unexpected end of JSON input' failures where the LLM response body is truncated in transit between the proxy and client. Root cause: network-level truncation where io.ReadAll returns partial data (observed in 3/50 CI runs through HAI proxy). The response body reading was already using io.ReadAll correctly, but transient network issues between the proxy and client can still cause partial reads. Changes: - Add Content-Length validation in doRequest: detect when fewer bytes arrive than the server declared, triggering a retry - Add retry logic in Complete: retries once on retryable errors (body read failures, content-length mismatches) with a 500ms backoff - Add parse-level retry in main: if ParseResponse fails, re-requests from the LLM once before giving up (defensive, since retries always succeed per issue evidence) - Improve ParseResponse error diagnostics: log raw vs cleaned lengths and a preview of the cleaned content to aid future debugging Does NOT retry on API errors (4xx/5xx) or structural issues — only transient body read problems. Closes #47
2026-05-07 00:44:32 -07:00
parent cabbb5a55a
commit db479d0ff4
4 changed files with 216 additions and 18 deletions
@@ -254,25 +254,41 @@ func main() {
 		slog.Warn("context trimmed to fit budget", "trimmed", budgetResult.Trimmed)
 	}

-	// Step 8: Call LLM
+	// Step 8: Call LLM (with retry on parse failure)
 	slog.Info("sending request to LLM", "model", *llmModel)
 	messages := []llm.Message{
 		{Role: "system", Content: budgetResult.SystemPrompt},
 		{Role: "user", Content: budgetResult.UserPrompt},
 	}

-	response, err := llmClient.Complete(ctx, messages)
-	if err != nil {
-		slog.Error("LLM request failed", "model", *llmModel, "error", err)
-		os.Exit(1)
-	}
-	slog.Info("LLM response received", "bytes", len(response))
+	var response string
+	var result *review.ReviewResult
+	for attempt := 1; attempt <= 2; attempt++ {
+		if attempt > 1 {
+			slog.Warn("retrying LLM request after parse failure", "attempt", attempt)
+			time.Sleep(time.Second)
+		}

-	// Step 9: Parse response
-	result, err := review.ParseResponse(response)
-	if err != nil {
-		slog.Error("failed to parse LLM response", "error", err)
-		os.Exit(1)
+		response, err = llmClient.Complete(ctx, messages)
+		if err != nil {
+			slog.Error("LLM request failed", "model", *llmModel, "error", err, "attempt", attempt)
+			if attempt == 2 {
+				os.Exit(1)
+			}
+			continue
+		}
+		slog.Info("LLM response received", "bytes", len(response), "attempt", attempt)
+
+		// Step 9: Parse response
+		result, err = review.ParseResponse(response)
+		if err != nil {
+			slog.Error("failed to parse LLM response", "error", err, "attempt", attempt)
+			if attempt == 2 {
+				os.Exit(1)
+			}
+			continue
+		}
+		break
 	}
 	slog.Info("review parsed", "verdict", result.Verdict, "findings", len(result.Findings))

@@ -75,12 +75,52 @@ type Message struct {
 // Complete sends a chat completion request and returns the assistant's response content.
 // The first message with role "system" is treated as the system prompt.
 func (c *Client) Complete(ctx context.Context, messages []Message) (string, error) {
-	switch c.provider {
-	case ProviderAnthropic:
-		return c.completeAnthropic(ctx, messages)
-	default:
-		return c.completeOpenAI(ctx, messages)
+	var result string
+	var err error
+
+	for attempt := 0; attempt < 2; attempt++ {
+		switch c.provider {
+		case ProviderAnthropic:
+			result, err = c.completeAnthropic(ctx, messages)
+		default:
+			result, err = c.completeOpenAI(ctx, messages)
+		}
+
+		if err == nil {
+			return result, nil
+		}
+
+		// Only retry on response body read errors (transient network issues).
+		// Do not retry on context cancellation, status errors, or parse errors
+		// that indicate a structural API problem.
+		if !isRetryableError(err) {
+			return "", err
+		}
+
+		if attempt == 0 && ctx.Err() == nil {
+			// Brief pause before retry to allow transient issues to resolve.
+			time.Sleep(500 * time.Millisecond)
+		}
 	}
+
+	return "", err
+}
+
+// isRetryableError returns true for transient errors worth retrying.
+func isRetryableError(err error) bool {
+	if err == nil {
+		return false
+	}
+	s := err.Error()
+	// Body read failures (connection reset, truncation)
+	if strings.Contains(s, "read response") {
+		return true
+	}
+	// Unexpected body length (our content-length validation)
+	if strings.Contains(s, "body length mismatch") {
+		return true
+	}
+	return false
 }

 // --- OpenAI-compatible implementation ---
@@ -231,6 +271,12 @@ func (c *Client) doRequest(req *http.Request, parse func([]byte) (string, error)
 		return "", fmt.Errorf("read response: %w", err)
 	}

+	// Validate body length against Content-Length header when present.
+	// A mismatch indicates the response was truncated in transit.
+	if cl := resp.ContentLength; cl > 0 && int64(len(body)) < cl {
+		return "", fmt.Errorf("body length mismatch: Content-Length=%d, received=%d", cl, len(body))
+	}
+
 	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
 		return "", fmt.Errorf("LLM API error (status %d): %s", resp.StatusCode, string(body))
 	}
@@ -3,6 +3,7 @@ package llm
 import (
 	"context"
 	"encoding/json"
+	"fmt"
 	"net/http"
 	"net/http/httptest"
 	"testing"
@@ -295,3 +296,131 @@ func TestWithProvider(t *testing.T) {
 		t.Errorf("expected provider anthropic, got %s", client.provider)
 	}
 }
+
+func TestComplete_RetryOnBodyReadError(t *testing.T) {
+	attempts := 0
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		attempts++
+		if attempts == 1 {
+			// First attempt: send headers then close connection abruptly
+			// Simulate by writing partial response and flushing with wrong Content-Length
+			w.Header().Set("Content-Length", "1000")
+			w.WriteHeader(http.StatusOK)
+			w.Write([]byte(`{"choices":[{"message":{"con`))
+			// The test HTTP server will close the connection after handler returns,
+			// but Content-Length mismatch means client gets fewer bytes than expected
+			return
+		}
+		// Second attempt: succeed
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(ChatResponse{
+			Choices: []struct {
+				Message struct {
+					Content string `json:"content"`
+				} `json:"message"`
+			}{{Message: struct {
+				Content string `json:"content"`
+			}{Content: "success"}}},
+		})
+	}))
+	defer server.Close()
+
+	client := NewClient(server.URL, "key", "model")
+	got, err := client.Complete(context.Background(), []Message{{Role: "user", Content: "Hi"}})
+	if err != nil {
+		t.Fatalf("expected retry to succeed, got error: %v", err)
+	}
+	if got != "success" {
+		t.Errorf("expected %q, got %q", "success", got)
+	}
+	if attempts != 2 {
+		t.Errorf("expected 2 attempts, got %d", attempts)
+	}
+}
+
+func TestComplete_ContentLengthMismatch(t *testing.T) {
+	attempts := 0
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		attempts++
+		if attempts == 1 {
+			// Claim Content-Length is larger than actual body
+			w.Header().Set("Content-Length", "500")
+			w.Header().Set("Content-Type", "application/json")
+			w.WriteHeader(http.StatusOK)
+			// Write less than 500 bytes
+			w.Write([]byte(`{"choices":[{"message":{"content":"partial"}}]}`))
+			return
+		}
+		// Second attempt succeeds
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(ChatResponse{
+			Choices: []struct {
+				Message struct {
+					Content string `json:"content"`
+				} `json:"message"`
+			}{{Message: struct {
+				Content string `json:"content"`
+			}{Content: "complete"}}},
+		})
+	}))
+	defer server.Close()
+
+	client := NewClient(server.URL, "key", "model")
+	got, err := client.Complete(context.Background(), []Message{{Role: "user", Content: "Hi"}})
+	if err != nil {
+		t.Fatalf("expected retry to succeed on content-length mismatch, got: %v", err)
+	}
+	if got != "complete" {
+		t.Errorf("expected %q, got %q", "complete", got)
+	}
+}
+
+func TestComplete_NoRetryOnAPIError(t *testing.T) {
+	attempts := 0
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		attempts++
+		w.WriteHeader(http.StatusBadRequest)
+		w.Write([]byte(`{"error":"bad request"}`))
+	}))
+	defer server.Close()
+
+	client := NewClient(server.URL, "key", "model")
+	_, err := client.Complete(context.Background(), []Message{{Role: "user", Content: "Hi"}})
+	if err == nil {
+		t.Fatal("expected error for 400, got nil")
+	}
+	if attempts != 1 {
+		t.Errorf("should not retry on API errors, got %d attempts", attempts)
+	}
+}
+
+func TestIsRetryableError(t *testing.T) {
+	tests := []struct {
+		name     string
+		err      string
+		expected bool
+	}{
+		{"nil formatted", "", false},
+		{"read response error", "read response: unexpected EOF", true},
+		{"body length mismatch", "body length mismatch: Content-Length=1000, received=500", true},
+		{"API error", "LLM API error (status 400): bad request", false},
+		{"parse error", "parse response: unexpected end of JSON input", false},
+		{"request error", "LLM request: connection refused", false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if tt.err == "" {
+				if isRetryableError(nil) {
+					t.Error("nil error should not be retryable")
+				}
+				return
+			}
+			err := fmt.Errorf("%s", tt.err)
+			got := isRetryableError(err)
+			if got != tt.expected {
+				t.Errorf("isRetryableError(%q) = %v, want %v", tt.err, got, tt.expected)
+			}
+		})
+	}
+}
@@ -33,7 +33,14 @@ func ParseResponse(response string) (*ReviewResult, error) {
 		// Try to repair before giving up.
 		repaired := repairJSON(cleaned)
 		if err2 := json.Unmarshal([]byte(repaired), &result); err2 != nil {
-			return nil, fmt.Errorf("parse LLM response as JSON: %w\nRaw response: %s", err, response)
+			// Include diagnostic info: lengths help identify truncation
+			rawLen := len(response)
+			cleanedLen := len(cleaned)
+			preview := cleaned
+			if len(preview) > 200 {
+				preview = preview[:100] + "..." + preview[len(preview)-100:]
+			}
+			return nil, fmt.Errorf("parse LLM response as JSON: %w\nRaw length: %d, cleaned length: %d\nCleaned preview: %s", err, rawLen, cleanedLen, preview)
 		}
 	}