Merge pull request 'fix: retry on transient LLM response body truncation' (#48) from fix/response-body-truncation into main

Reviewed-on: #48 Reviewed-by: Aaron Weiker <[email protected]>
2026-05-08 02:32:37 +00:00
parent cabbb5a55a db479d0ff4
commit 2089ca0f2d
4 changed files with 216 additions and 18 deletions
@@ -254,25 +254,41 @@ func main() {
 		slog.Warn("context trimmed to fit budget", "trimmed", budgetResult.Trimmed)
 	}

-	// Step 8: Call LLM
+	// Step 8: Call LLM (with retry on parse failure)
 	slog.Info("sending request to LLM", "model", *llmModel)
 	messages := []llm.Message{
 		{Role: "system", Content: budgetResult.SystemPrompt},
 		{Role: "user", Content: budgetResult.UserPrompt},
 	}

-	response, err := llmClient.Complete(ctx, messages)
-	if err != nil {
-		slog.Error("LLM request failed", "model", *llmModel, "error", err)
-		os.Exit(1)
-	}
-	slog.Info("LLM response received", "bytes", len(response))
+	var response string
+	var result *review.ReviewResult
+	for attempt := 1; attempt <= 2; attempt++ {
+		if attempt > 1 {
+			slog.Warn("retrying LLM request after parse failure", "attempt", attempt)
+			time.Sleep(time.Second)
+		}

-	// Step 9: Parse response
-	result, err := review.ParseResponse(response)
-	if err != nil {
-		slog.Error("failed to parse LLM response", "error", err)
-		os.Exit(1)
+		response, err = llmClient.Complete(ctx, messages)
+		if err != nil {
+			slog.Error("LLM request failed", "model", *llmModel, "error", err, "attempt", attempt)
+			if attempt == 2 {
+				os.Exit(1)
+			}
+			continue
+		}
+		slog.Info("LLM response received", "bytes", len(response), "attempt", attempt)
+
+		// Step 9: Parse response
+		result, err = review.ParseResponse(response)
+		if err != nil {
+			slog.Error("failed to parse LLM response", "error", err, "attempt", attempt)
+			if attempt == 2 {
+				os.Exit(1)
+			}
+			continue
+		}
+		break
 	}
 	slog.Info("review parsed", "verdict", result.Verdict, "findings", len(result.Findings))

@@ -75,12 +75,52 @@ type Message struct {
 // Complete sends a chat completion request and returns the assistant's response content.
 // The first message with role "system" is treated as the system prompt.
 func (c *Client) Complete(ctx context.Context, messages []Message) (string, error) {
-	switch c.provider {
-	case ProviderAnthropic:
-		return c.completeAnthropic(ctx, messages)
-	default:
-		return c.completeOpenAI(ctx, messages)
+	var result string
+	var err error
+
+	for attempt := 0; attempt < 2; attempt++ {
+		switch c.provider {
+		case ProviderAnthropic:
+			result, err = c.completeAnthropic(ctx, messages)
+		default:
+			result, err = c.completeOpenAI(ctx, messages)
+		}
+
+		if err == nil {
+			return result, nil
+		}
+
+		// Only retry on response body read errors (transient network issues).
+		// Do not retry on context cancellation, status errors, or parse errors
+		// that indicate a structural API problem.
+		if !isRetryableError(err) {
+			return "", err
+		}
+
+		if attempt == 0 && ctx.Err() == nil {
+			// Brief pause before retry to allow transient issues to resolve.
+			time.Sleep(500 * time.Millisecond)
+		}
 	}
+
+	return "", err
+}
+
+// isRetryableError returns true for transient errors worth retrying.
+func isRetryableError(err error) bool {
+	if err == nil {
+		return false
+	}
+	s := err.Error()
+	// Body read failures (connection reset, truncation)
+	if strings.Contains(s, "read response") {
+		return true
+	}
+	// Unexpected body length (our content-length validation)
+	if strings.Contains(s, "body length mismatch") {
+		return true
+	}
+	return false
 }

 // --- OpenAI-compatible implementation ---
@@ -231,6 +271,12 @@ func (c *Client) doRequest(req *http.Request, parse func([]byte) (string, error)
 		return "", fmt.Errorf("read response: %w", err)
 	}

+	// Validate body length against Content-Length header when present.
+	// A mismatch indicates the response was truncated in transit.
+	if cl := resp.ContentLength; cl > 0 && int64(len(body)) < cl {
+		return "", fmt.Errorf("body length mismatch: Content-Length=%d, received=%d", cl, len(body))
+	}
+
 	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
 		return "", fmt.Errorf("LLM API error (status %d): %s", resp.StatusCode, string(body))
 	}
@@ -3,6 +3,7 @@ package llm
 import (
 	"context"
 	"encoding/json"
+	"fmt"
 	"net/http"
 	"net/http/httptest"
 	"testing"
@@ -295,3 +296,131 @@ func TestWithProvider(t *testing.T) {
 		t.Errorf("expected provider anthropic, got %s", client.provider)
 	}
 }
+
+func TestComplete_RetryOnBodyReadError(t *testing.T) {
+	attempts := 0
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		attempts++
+		if attempts == 1 {
+			// First attempt: send headers then close connection abruptly
+			// Simulate by writing partial response and flushing with wrong Content-Length
+			w.Header().Set("Content-Length", "1000")
+			w.WriteHeader(http.StatusOK)
+			w.Write([]byte(`{"choices":[{"message":{"con`))
+			// The test HTTP server will close the connection after handler returns,
+			// but Content-Length mismatch means client gets fewer bytes than expected
+			return
+		}
+		// Second attempt: succeed
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(ChatResponse{
+			Choices: []struct {
+				Message struct {
+					Content string `json:"content"`
+				} `json:"message"`
+			}{{Message: struct {
+				Content string `json:"content"`
+			}{Content: "success"}}},
+		})
+	}))
+	defer server.Close()
+
+	client := NewClient(server.URL, "key", "model")
+	got, err := client.Complete(context.Background(), []Message{{Role: "user", Content: "Hi"}})
+	if err != nil {
+		t.Fatalf("expected retry to succeed, got error: %v", err)
+	}
+	if got != "success" {
+		t.Errorf("expected %q, got %q", "success", got)
+	}
+	if attempts != 2 {
+		t.Errorf("expected 2 attempts, got %d", attempts)
+	}
+}
+
+func TestComplete_ContentLengthMismatch(t *testing.T) {
+	attempts := 0
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		attempts++
+		if attempts == 1 {
+			// Claim Content-Length is larger than actual body
+			w.Header().Set("Content-Length", "500")
+			w.Header().Set("Content-Type", "application/json")
+			w.WriteHeader(http.StatusOK)
+			// Write less than 500 bytes
+			w.Write([]byte(`{"choices":[{"message":{"content":"partial"}}]}`))
+			return
+		}
+		// Second attempt succeeds
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(ChatResponse{
+			Choices: []struct {
+				Message struct {
+					Content string `json:"content"`
+				} `json:"message"`
+			}{{Message: struct {
+				Content string `json:"content"`
+			}{Content: "complete"}}},
+		})
+	}))
+	defer server.Close()
+
+	client := NewClient(server.URL, "key", "model")
+	got, err := client.Complete(context.Background(), []Message{{Role: "user", Content: "Hi"}})
+	if err != nil {
+		t.Fatalf("expected retry to succeed on content-length mismatch, got: %v", err)
+	}
+	if got != "complete" {
+		t.Errorf("expected %q, got %q", "complete", got)
+	}
+}
+
+func TestComplete_NoRetryOnAPIError(t *testing.T) {
+	attempts := 0
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		attempts++
+		w.WriteHeader(http.StatusBadRequest)
+		w.Write([]byte(`{"error":"bad request"}`))
+	}))
+	defer server.Close()
+
+	client := NewClient(server.URL, "key", "model")
+	_, err := client.Complete(context.Background(), []Message{{Role: "user", Content: "Hi"}})
+	if err == nil {
+		t.Fatal("expected error for 400, got nil")
+	}
+	if attempts != 1 {
+		t.Errorf("should not retry on API errors, got %d attempts", attempts)
+	}
+}
+
+func TestIsRetryableError(t *testing.T) {
+	tests := []struct {
+		name     string
+		err      string
+		expected bool
+	}{
+		{"nil formatted", "", false},
+		{"read response error", "read response: unexpected EOF", true},
+		{"body length mismatch", "body length mismatch: Content-Length=1000, received=500", true},
+		{"API error", "LLM API error (status 400): bad request", false},
+		{"parse error", "parse response: unexpected end of JSON input", false},
+		{"request error", "LLM request: connection refused", false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if tt.err == "" {
+				if isRetryableError(nil) {
+					t.Error("nil error should not be retryable")
+				}
+				return
+			}
+			err := fmt.Errorf("%s", tt.err)
+			got := isRetryableError(err)
+			if got != tt.expected {
+				t.Errorf("isRetryableError(%q) = %v, want %v", tt.err, got, tt.expected)
+			}
+		})
+	}
+}
@@ -33,7 +33,14 @@ func ParseResponse(response string) (*ReviewResult, error) {
 		// Try to repair before giving up.
 		repaired := repairJSON(cleaned)
 		if err2 := json.Unmarshal([]byte(repaired), &result); err2 != nil {
-			return nil, fmt.Errorf("parse LLM response as JSON: %w\nRaw response: %s", err, response)
+			// Include diagnostic info: lengths help identify truncation
+			rawLen := len(response)
+			cleanedLen := len(cleaned)
+			preview := cleaned
+			if len(preview) > 200 {
+				preview = preview[:100] + "..." + preview[len(preview)-100:]
+			}
+			return nil, fmt.Errorf("parse LLM response as JSON: %w\nRaw length: %d, cleaned length: %d\nCleaned preview: %s", err, rawLen, cleanedLen, preview)
 		}
 	}