feat(gitea): add retry logic for 5xx errors #69

Merged
aweiker merged 7 commits from issue-68 into main 2026-05-11 12:59:50 +00:00
2 changed files with 111 additions and 37 deletions
Showing only changes of commit 5498dccd60 - Show all commits
+21 -1
View File
5
@@ -72,6 +72,12 @@ func NewClient(baseURL, token string) *Client {
}
Review

[NIT] SetHTTPClient is exported and documented as 'intended for testing'. Exporting test-only methods on production types is generally discouraged — it pollutes the public API and could be misused. Consider using the export_test.go pattern to expose this only during tests, or making the http field settable via a constructor option.

**[NIT]** `SetHTTPClient` is exported and documented as 'intended for testing'. Exporting test-only methods on production types is generally discouraged — it pollutes the public API and could be misused. Consider using the `export_test.go` pattern to expose this only during tests, or making the `http` field settable via a constructor option.
}
// SetHTTPClient sets the underlying HTTP client used for requests.
// This is intended for testing to inject mock transports.
func (c *Client) SetHTTPClient(hc *http.Client) {
c.http = hc
}
// PullRequest holds relevant PR metadata.
type PullRequest struct {
Title string `json:"title"`
23
@@ -302,6 +308,20 @@ func redactURL(rawURL string) string {
return parsed.String()
}
Outdated
Review

[NIT] The log field name lastError uses camelCase. slog convention (and the existing slog.Warn call at line 323) uses lowercase snake-case or single-word keys. Recommend "last_error" or "error" for consistency with the other log call in this function which uses "error".

**[NIT]** The log field name `lastError` uses camelCase. slog convention (and the existing `slog.Warn` call at line 323) uses lowercase snake-case or single-word keys. Recommend `"last_error"` or `"error"` for consistency with the other log call in this function which uses `"error"`.
// sanitizeErrorForLog returns a loggable version of an error that omits
// potentially sensitive content like response bodies. For APIError, only
// the status code is included; for other errors, the type is preserved.
func sanitizeErrorForLog(err error) string {
if err == nil {
Outdated
Review

[MINOR] Retry warning logs include the error object (lastError) which will render the API error message (truncated to 200 chars) and could expose portions of server error responses in logs. While bodies are size-limited and URLs are redacted, consider the operational sensitivity of logging server error content at WARN level.

**[MINOR]** Retry warning logs include the error object (lastError) which will render the API error message (truncated to 200 chars) and could expose portions of server error responses in logs. While bodies are size-limited and URLs are redacted, consider the operational sensitivity of logging server error content at WARN level.
return "<nil>"
Review

[MINOR] When isTemporaryNetError returns true and we set lastErr = err and continue, the delay logic in the next iteration uses attempt-1 as the backoff index. However, unlike HTTP 5xx errors where lastErr is always set before the delay block, network errors also set lastErr. The code is correct (lastErr will be non-nil for the log message), but the flow could be clearer — when a network error occurs on attempt 2 (the last), the code falls through to 'return nil, err' (not lastErr). This is actually correct behavior (returns the raw error), but it is inconsistent with the HTTP error path which always returns lastErr. Minor readability issue.

**[MINOR]** When isTemporaryNetError returns true and we set lastErr = err and continue, the delay logic in the next iteration uses attempt-1 as the backoff index. However, unlike HTTP 5xx errors where lastErr is always set before the delay block, network errors also set lastErr. The code is correct (lastErr will be non-nil for the log message), but the flow could be clearer — when a network error occurs on attempt 2 (the last), the code falls through to 'return nil, err' (not lastErr). This is actually correct behavior (returns the raw error), but it is inconsistent with the HTTP error path which always returns lastErr. Minor readability issue.
}
Outdated
Review

[MINOR] Timer leak on zero-delay path: when delay == 0 (i.e., attempt-1 >= len(backoff) or backoff is empty), the code skips the timer block entirely and logs nothing. This is correct behavior. However, the slog.Warn log is only emitted when delay > 0, meaning silent retries happen with no observability when the backoff slice is shorter than maxAttempts-1. Consider logging retries even when delay is zero, or document this intentional behavior in the comment.

**[MINOR]** Timer leak on zero-delay path: when `delay == 0` (i.e., `attempt-1 >= len(backoff)` or `backoff` is empty), the code skips the timer block entirely and logs nothing. This is correct behavior. However, the `slog.Warn` log is only emitted when `delay > 0`, meaning silent retries happen with no observability when the backoff slice is shorter than `maxAttempts-1`. Consider logging retries even when delay is zero, or document this intentional behavior in the comment.
var apiErr *APIError
if errors.As(err, &apiErr) {
return fmt.Sprintf("HTTP %d", apiErr.StatusCode)
}
return err.Error()
}
// doGet performs an HTTP GET request with retry on 5xx errors and temporary
// network errors. Retries up to 3 times with exponential backoff (1s, 2s delays
// by default; configurable via Client.RetryBackoff for testing).
1
@@ -334,7 +354,7 @@ func (c *Client) doGet(ctx context.Context, reqURL string) ([]byte, error) {
"attempt", attempt+1,
Outdated
Review

[NIT] The RetryBackoff field is exported (capital R), which exposes an implementation detail of the retry mechanism as a public API. The doc comment explicitly warns 'Modifying it while requests are in flight is not safe', which means the Client concurrency safety documentation ('A Client is safe for concurrent use by multiple goroutines') is now conditionally true. This is a minor design tension worth noting — the concurrency safety claim in the struct comment should probably be qualified, or RetryBackoff should only be settable before any request is made (which the doc comment does say, but the type system can't enforce it).

**[NIT]** The `RetryBackoff` field is exported (capital R), which exposes an implementation detail of the retry mechanism as a public API. The doc comment explicitly warns 'Modifying it while requests are in flight is not safe', which means the `Client` concurrency safety documentation ('A Client is safe for concurrent use by multiple goroutines') is now conditionally true. This is a minor design tension worth noting — the concurrency safety claim in the struct comment should probably be qualified, or `RetryBackoff` should only be settable before any request is made (which the doc comment does say, but the type system can't enforce it).
"url", redactURL(reqURL),
"delay", delay.String(),
"lastError", lastErr)
"lastError", sanitizeErrorForLog(lastErr))
timer := time.NewTimer(delay)
select {
1
+90 -36
View File
@@ -10,6 +10,7 @@ import (
"net/http"
"net/http/httptest"
"strings"
"sync/atomic"
"syscall"
"testing"
"time"
6
@@ -891,49 +892,60 @@ func TestDoGet_RespectsContextCancellation(t *testing.T) {
}
}
Review

[MINOR] TestDoGet_RetriesOnTemporaryNetError is a timing-dependent test that explicitly acknowledges it may not verify actual retry behavior ('The request might succeed or fail depending on timing'). It provides almost no guaranteed assertion — attempts could be 0 and the test still passes. This test would be more valuable if it used a controlled mechanism (e.g., an atomic counter with a channel to sequence listener startup) rather than relying on time.Sleep. As-is, it's essentially a no-op assertion that just verifies the code path compiles.

**[MINOR]** `TestDoGet_RetriesOnTemporaryNetError` is a timing-dependent test that explicitly acknowledges it may not verify actual retry behavior ('The request might succeed or fail depending on timing'). It provides almost no guaranteed assertion — `attempts` could be 0 and the test still passes. This test would be more valuable if it used a controlled mechanism (e.g., an atomic counter with a channel to sequence listener startup) rather than relying on `time.Sleep`. As-is, it's essentially a no-op assertion that just verifies the code path compiles.
func TestDoGet_RetriesOnTemporaryNetError(t *testing.T) {
attempts := 0
// Create a listener that we can close to simulate connection refused
ln, err := net.Listen("tcp", "127.0.0.1:0")
if err != nil {
t.Fatalf("failed to create listener: %v", err)
// mockTransport is a test helper that returns errors for the first N calls,
// then delegates to a real server.
Outdated
Review

[MINOR] TestDoGet_RetriesOnTemporaryNetError is timing-dependent and makes no deterministic assertions — it explicitly says "actual retry behavior depends on timing". The test comment acknowledges this. While the test does exercise the code path, it provides weak guarantees. A more reliable approach would be to use isTemporaryNetError unit tests (which are present) and a separate test with a mock/stub transport that returns syscall.ECONNREFUSED directly, avoiding the real TCP listener race.

**[MINOR]** `TestDoGet_RetriesOnTemporaryNetError` is timing-dependent and makes no deterministic assertions — it explicitly says "actual retry behavior depends on timing". The test comment acknowledges this. While the test does exercise the code path, it provides weak guarantees. A more reliable approach would be to use `isTemporaryNetError` unit tests (which are present) and a separate test with a mock/stub transport that returns `syscall.ECONNREFUSED` directly, avoiding the real TCP listener race.
type mockTransport struct {
failCount int32 // number of failures remaining (atomic)
failErr error // error to return on failure
realServer *httptest.Server
attemptsMade atomic.Int32 // tracks total attempts
}
func (m *mockTransport) RoundTrip(req *http.Request) (*http.Response, error) {
m.attemptsMade.Add(1)
remaining := atomic.AddInt32(&m.failCount, -1)
Outdated
Review

[NIT] TestDoGet_RetriesOnTemporaryNetError has no meaningful assertions — the comment says 'actual retry behavior depends on timing' and the return values are discarded with _, _. This test exercises the code path but doesn't verify any postcondition, making it closer to a smoke test. It won't cause flakiness but also won't catch regressions. Consider whether it provides sufficient value or should be replaced with a more deterministic approach (e.g., using a custom http.Transport that fails N times then succeeds).

**[NIT]** `TestDoGet_RetriesOnTemporaryNetError` has no meaningful assertions — the comment says 'actual retry behavior depends on timing' and the return values are discarded with `_, _`. This test exercises the code path but doesn't verify any postcondition, making it closer to a smoke test. It won't cause flakiness but also won't catch regressions. Consider whether it provides sufficient value or should be replaced with a more deterministic approach (e.g., using a custom http.Transport that fails N times then succeeds).
if remaining >= 0 {
// Still have failures to return
return nil, m.failErr
}
addr := ln.Addr().String()
// Redirect to real server
req.URL.Host = m.realServer.Listener.Addr().String()
req.URL.Scheme = "http"
return http.DefaultTransport.RoundTrip(req)
}
// Close immediately to cause connection refused on first attempts
ln.Close()
func TestDoGet_RetriesOnTemporaryNetError(t *testing.T) {
// Real server that will handle successful requests
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
w.Write([]byte(`{"status":"ok"}`))
}))
defer server.Close()
// Start a server after a short delay to succeed on retry
go func() {
time.Sleep(5 * time.Millisecond)
newLn, err := net.Listen("tcp", addr)
if err != nil {
// Port might be reused; not critical for this test
return
}
defer newLn.Close()
// Mock transport: fail twice with ECONNREFUSED, then succeed
mt := &mockTransport{
failCount: 2,
failErr: &net.OpError{Op: "dial", Net: "tcp", Err: syscall.ECONNREFUSED},
realServer: server,
}
for {
conn, err := newLn.Accept()
if err != nil {
return
}
attempts++
// Respond with success
conn.Write([]byte("HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nok"))
conn.Close()
}
}()
client := NewClient("http://fake-host/", "test-token")
client.SetHTTPClient(&http.Client{Transport: mt})
client.RetryBackoff = []time.Duration{1 * time.Millisecond, 1 * time.Millisecond}
client := NewClient("http://"+addr, "test-token")
client.RetryBackoff = []time.Duration{10 * time.Millisecond, 10 * time.Millisecond}
body, err := client.doGet(context.Background(), "http://fake-host/test")
if err != nil {
t.Fatalf("expected success after retries, got error: %v", err)
}
if string(body) != `{"status":"ok"}` {
t.Errorf("body = %q, want %q", string(body), `{"status":"ok"}`)
}
// The request might succeed or fail depending on timing, but the key is
// that we attempt retry on connection refused
_, _ = client.doGet(context.Background(), "http://"+addr+"/test")
// This test verifies the code path exists; actual retry behavior depends on timing
// Should have made exactly 3 attempts: 2 failures + 1 success
if got := mt.attemptsMade.Load(); got != 3 {
t.Errorf("attempts = %d, want 3 (2 failures + 1 success)", got)
}
}
func TestIsTemporaryNetError(t *testing.T) {
@@ -1037,3 +1049,45 @@ func TestRedactURL(t *testing.T) {
})
}
}
func TestSanitizeErrorForLog(t *testing.T) {
tests := []struct {
name string
err error
want string
}{
{
name: "nil error",
err: nil,
want: "<nil>",
},
{
name: "APIError omits body",
err: &APIError{StatusCode: 500, Body: "internal error: database connection failed"},
want: "HTTP 500",
},
{
name: "APIError with large body still only shows status",
err: &APIError{StatusCode: 502, Body: strings.Repeat("x", 1000)},
want: "HTTP 502",
},
{
name: "non-API error preserved",
err: fmt.Errorf("connection refused"),
want: "connection refused",
},
{
name: "wrapped APIError",
err: fmt.Errorf("request failed: %w", &APIError{StatusCode: 503, Body: "service unavailable"}),
want: "HTTP 503",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := sanitizeErrorForLog(tt.err)
if got != tt.want {
t.Errorf("sanitizeErrorForLog() = %q, want %q", got, tt.want)
}
})
}
}