feat(gitea): add retry logic for 5xx errors #69
@@ -11,6 +11,7 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
@@ -51,6 +52,11 @@ type Client struct {
|
||||
baseURL string
|
||||
token string
|
||||
http *http.Client
|
||||
|
|
||||
|
||||
// RetryBackoff defines the delays between retry attempts.
|
||||
|
sonnet-review-bot
commented
[MINOR] RetryBackoff is an exported field on Client, which exposes a testing/configuration hook as part of the public API surface. Per repository conventions and idiomatic Go (nil-opts pattern), this is usable and documented, but it makes the test-configuration concern part of the permanent API contract. A more idiomatic approach would be an unexported field with a package-internal setter, but the current approach is pragmatic and acceptable given the documented intent. **[MINOR]** RetryBackoff is an exported field on Client, which exposes a testing/configuration hook as part of the public API surface. Per repository conventions and idiomatic Go (nil-opts pattern), this is usable and documented, but it makes the test-configuration concern part of the permanent API contract. A more idiomatic approach would be an unexported field with a package-internal setter, but the current approach is pragmatic and acceptable given the documented intent.
sonnet-review-bot
commented
[MINOR] **[MINOR]** `RetryBackoff` is an exported field on `Client`, which violates the concurrency safety guarantee documented on the type: "A Client is safe for concurrent use by multiple goroutines." The field's own comment acknowledges "Modifying it while requests are in flight is not safe", but this creates an exception to the advertised contract. Consider making it unexported and configuring it only through `NewClient` or a constructor option, or at minimum update the concurrency safety doc comment on `Client` to note the exception.
|
||||
// RetryBackoff[i] is the delay before attempt i+1 (after attempt i fails).
|
||||
// If nil, defaults to {1s, 2s}. Set to shorter durations in tests.
|
||||
RetryBackoff []time.Duration
|
||||
}
|
||||
|
||||
// NewClient creates a new Gitea API client.
|
||||
@@ -215,13 +221,49 @@ func (c *Client) PostReview(ctx context.Context, owner, repo string, number int,
|
||||
}
|
||||
return &review, nil
|
||||
}
|
||||
// doGet performs an HTTP GET request with retry on 5xx errors.
|
||||
// Retries up to 3 times with exponential backoff (1s, 2s delays).
|
||||
|
||||
// isTemporaryNetError reports whether err is a temporary network error worth retrying.
|
||||
|
gpt-review-bot
commented
[NIT] Logging at WARN level on every retry may be noisy for library consumers. Consider lowering to INFO/DEBUG or making retry logging optional/configurable to avoid cluttering logs in normal operation. **[NIT]** Logging at WARN level on every retry may be noisy for library consumers. Consider lowering to INFO/DEBUG or making retry logging optional/configurable to avoid cluttering logs in normal operation.
|
||||
// This includes connection refused, DNS failures, and timeouts that aren't context-based.
|
||||
|
[MINOR] Retry warnings log the full request URL and the last error (which may include server-provided body text). While bodies are truncated and URLs here do not include auth, logging full URLs or server error content can inadvertently leak sensitive query parameters or details if future callers pass sensitive data in query strings. Consider redacting query parameters and limiting error detail in logs. **[MINOR]** Retry warnings log the full request URL and the last error (which may include server-provided body text). While bodies are truncated and URLs here do not include auth, logging full URLs or server error content can inadvertently leak sensitive query parameters or details if future callers pass sensitive data in query strings. Consider redacting query parameters and limiting error detail in logs.
[NIT] isRetriableSyscallError returns true for unknown underlying errors, causing retries even on potentially permanent failures. This is bounded and not a security issue, but could slightly increase request attempts against misconfigured endpoints. Consider restricting retries to known transient error classes. **[NIT]** isRetriableSyscallError returns true for unknown underlying errors, causing retries even on potentially permanent failures. This is bounded and not a security issue, but could slightly increase request attempts against misconfigured endpoints. Consider restricting retries to known transient error classes.
gpt-review-bot
commented
[NIT] Consider adding jitter to the retry backoff to avoid synchronized retries across multiple clients when the server experiences transient issues. **[NIT]** Consider adding jitter to the retry backoff to avoid synchronized retries across multiple clients when the server experiences transient issues.
|
||||
func isTemporaryNetError(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
|
gpt-review-bot
commented
[MINOR] Using time.After in a select with context cancellation can leave a timer to fire after cancellation. Consider using time.NewTimer/backoffTimer := time.NewTimer(d) and deferring backoffTimer.Stop() to avoid unnecessary timers if ctx is canceled before the delay elapses. **[MINOR]** Using time.After in a select with context cancellation can leave a timer to fire after cancellation. Consider using time.NewTimer/backoffTimer := time.NewTimer(d) and deferring backoffTimer.Stop() to avoid unnecessary timers if ctx is canceled before the delay elapses.
|
||||
|
||||
// Check for common retriable error patterns in the error chain.
|
||||
// Check OpError first since it embeds net.Error, and we want to catch
|
||||
// connection refused, network unreachable, etc. as retriable.
|
||||
|
gpt-review-bot
commented
[MINOR] isTemporaryNetError treats any net.OpError as retriable, which may cause retries on permanent failures (e.g., no such host). Consider narrowing the check (e.g., prefer net.Error timeouts, DNSError.IsTemporary, or specific syscall errors) to avoid unnecessary retries. **[MINOR]** isTemporaryNetError treats any net.OpError as retriable, which may cause retries on permanent failures (e.g., no such host). Consider narrowing the check (e.g., prefer net.Error timeouts, DNSError.IsTemporary, or specific syscall errors) to avoid unnecessary retries.
|
||||
var opErr *net.OpError
|
||||
if errors.As(err, &opErr) {
|
||||
|
security-review-bot marked this conversation as resolved
Outdated
[MINOR] On non-2xx responses, the code reads the entire response body with io.ReadAll and stores it in APIError. This can allow a malicious or misbehaving server to force large allocations. Consider limiting the size for error bodies (e.g., io.LimitReader) since the body is only used for error context. **[MINOR]** On non-2xx responses, the code reads the entire response body with io.ReadAll and stores it in APIError. This can allow a malicious or misbehaving server to force large allocations. Consider limiting the size for error bodies (e.g., io.LimitReader) since the body is only used for error context.
|
||||
// Connection refused, network unreachable, etc. are typically transient
|
||||
return true
|
||||
}
|
||||
|
||||
// DNS errors are often transient
|
||||
var dnsErr *net.DNSError
|
||||
|
sonnet-review-bot
commented
[MINOR] Network/transport errors from **[MINOR]** Network/transport errors from `c.http.Do(req)` (e.g., connection refused, DNS failure) are returned immediately without retry. Transient network errors are often just as worth retrying as 5xx responses. This is a design choice, but worth noting — the PR description only mentions 5xx so this is intentional, but it limits the usefulness of the retry in some failure modes.
|
||||
if errors.As(err, &dnsErr) {
|
||||
return dnsErr.Temporary()
|
||||
}
|
||||
|
||||
// Check for net.Error with Timeout() (Temporary is deprecated)
|
||||
var netErr net.Error
|
||||
if errors.As(err, &netErr) {
|
||||
return netErr.Timeout()
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// doGet performs an HTTP GET request with retry on 5xx errors and temporary
|
||||
// network errors. Retries up to 3 times with exponential backoff (1s, 2s delays
|
||||
// by default; configurable via Client.RetryBackoff for testing).
|
||||
func (c *Client) doGet(ctx context.Context, reqURL string) ([]byte, error) {
|
||||
const maxAttempts = 3
|
||||
|
gpt-review-bot
commented
[NIT] The retry condition duplicates the 5xx check inline. For consistency and reuse, consider using the new IsServerError helper here (errors.As on APIError or check status) to keep logic centralized. **[NIT]** The retry condition duplicates the 5xx check inline. For consistency and reuse, consider using the new IsServerError helper here (errors.As on APIError or check status) to keep logic centralized.
|
||||
// backoff[i] is the delay before attempt i+1 (i.e., after attempt i fails).
|
||||
// First attempt (i=0) has no delay; retries wait 1s then 2s.
|
||||
backoff := []time.Duration{1 * time.Second, 2 * time.Second}
|
||||
// First attempt (i=0) has no delay; retries wait 1s then 2s by default.
|
||||
backoff := c.RetryBackoff
|
||||
if backoff == nil {
|
||||
backoff = []time.Duration{1 * time.Second, 2 * time.Second}
|
||||
}
|
||||
|
||||
// maxErrorBodyBytes limits how much of an error response body we read
|
||||
// to protect against malicious servers sending unbounded data.
|
||||
@@ -230,18 +272,26 @@ func (c *Client) doGet(ctx context.Context, reqURL string) ([]byte, error) {
|
||||
var lastErr error
|
||||
for attempt := 0; attempt < maxAttempts; attempt++ {
|
||||
if attempt > 0 {
|
||||
delay := backoff[attempt-1]
|
||||
slog.Warn("retrying request after server error",
|
||||
"attempt", attempt+1,
|
||||
"url", reqURL,
|
||||
"delay", delay.String())
|
||||
// Determine delay: use backoff slice if available, otherwise no delay
|
||||
var delay time.Duration
|
||||
if attempt-1 < len(backoff) {
|
||||
|
sonnet-review-bot
commented
[MINOR] In **[MINOR]** In `isTemporaryNetError`, treating all `*net.OpError` as retriable is overly broad. OpError covers connection refused (transient) but also permission denied and other permanent conditions. A more precise check would inspect `opErr.Err` for syscall errors. That said, the comment acknowledges this and the practical impact on the Gitea use-case is low.
|
||||
delay = backoff[attempt-1]
|
||||
}
|
||||
|
||||
timer := time.NewTimer(delay)
|
||||
select {
|
||||
case <-timer.C:
|
||||
case <-ctx.Done():
|
||||
timer.Stop()
|
||||
return nil, ctx.Err()
|
||||
if delay > 0 {
|
||||
slog.Warn("retrying request after error",
|
||||
"attempt", attempt+1,
|
||||
|
sonnet-review-bot
commented
[NIT] The **[NIT]** The `GetAllFilesInPath_500Propagates` test (already existing) relies on the server always returning 500, but with retry logic now in place, the test server will be hit 3 times before the error propagates. The test still passes (because the server always returns 500), but test execution time increases by ~3ms (with default 0-delay in tests using `RetryBackoff = nil`). Tests for `GetAllFilesInPath` don't set `RetryBackoff`, so they'll use the default 1s/2s delays — but since `httptest.NewServer` is local, the server succeeds immediately and returns 500, so these tests hit the server 3 times with 1s+2s delays... Actually this could make existing integration-style tests significantly slower. Worth verifying that existing tests (not the new retry tests) that hit 500 responses set short `RetryBackoff`, or accept the 3s slowdown.
|
||||
"url", reqURL,
|
||||
"delay", delay.String(),
|
||||
"lastError", lastErr)
|
||||
|
||||
timer := time.NewTimer(delay)
|
||||
select {
|
||||
case <-timer.C:
|
||||
case <-ctx.Done():
|
||||
timer.Stop()
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
sonnet-review-bot
commented
[NIT] The comment **[NIT]** The comment `// backoff[i] is the delay before attempt i+1 (i.e., after attempt i fails).` appears both as an inline comment and in the `RetryBackoff` field doc comment. The duplication is harmless but slightly redundant.
|
||||
|
||||
@@ -253,6 +303,16 @@ func (c *Client) doGet(ctx context.Context, reqURL string) ([]byte, error) {
|
||||
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
// Check if this is a temporary/transient network error worth retrying.
|
||||
// We only retry if there are attempts remaining.
|
||||
|
sonnet-review-bot
commented
[MINOR] The **[MINOR]** The `RetryBackoff` field is exported on `Client`, making it part of the public API surface. This is used as a testing seam, but exporting a mutable slice field on a type documented as 'safe for concurrent use' is a design smell. The field comment itself warns 'Modifying it while requests are in flight is not safe', which contradicts the concurrency safety claim for the type. A cleaner pattern would be to keep it unexported and provide a `WithRetryBackoff([]time.Duration) *Client` constructor option, or accept it only in a constructor. Since it's already exported and the comment addresses the limitation, this is minor but worth noting for future refactors.
|
||||
if attempt < maxAttempts-1 && isTemporaryNetError(err) {
|
||||
slog.Warn("temporary network error, will retry",
|
||||
"attempt", attempt+1,
|
||||
|
sonnet-review-bot
commented
[NIT] The log field name **[NIT]** The log field name `lastError` uses camelCase. slog convention (and the existing `slog.Warn` call at line 323) uses lowercase snake-case or single-word keys. Recommend `"last_error"` or `"error"` for consistency with the other log call in this function which uses `"error"`.
|
||||
"url", reqURL,
|
||||
"error", err)
|
||||
lastErr = err
|
||||
continue
|
||||
}
|
||||
|
[MINOR] Retry warning logs include the error object (lastError) which will render the API error message (truncated to 200 chars) and could expose portions of server error responses in logs. While bodies are size-limited and URLs are redacted, consider the operational sensitivity of logging server error content at WARN level. **[MINOR]** Retry warning logs include the error object (lastError) which will render the API error message (truncated to 200 chars) and could expose portions of server error responses in logs. While bodies are size-limited and URLs are redacted, consider the operational sensitivity of logging server error content at WARN level.
|
||||
return nil, err
|
||||
|
sonnet-review-bot
commented
[MINOR] When isTemporaryNetError returns true and we set lastErr = err and continue, the delay logic in the next iteration uses attempt-1 as the backoff index. However, unlike HTTP 5xx errors where lastErr is always set before the delay block, network errors also set lastErr. The code is correct (lastErr will be non-nil for the log message), but the flow could be clearer — when a network error occurs on attempt 2 (the last), the code falls through to 'return nil, err' (not lastErr). This is actually correct behavior (returns the raw error), but it is inconsistent with the HTTP error path which always returns lastErr. Minor readability issue. **[MINOR]** When isTemporaryNetError returns true and we set lastErr = err and continue, the delay logic in the next iteration uses attempt-1 as the backoff index. However, unlike HTTP 5xx errors where lastErr is always set before the delay block, network errors also set lastErr. The code is correct (lastErr will be non-nil for the log message), but the flow could be clearer — when a network error occurs on attempt 2 (the last), the code falls through to 'return nil, err' (not lastErr). This is actually correct behavior (returns the raw error), but it is inconsistent with the HTTP error path which always returns lastErr. Minor readability issue.
|
||||
}
|
||||
|
sonnet-review-bot
commented
[MINOR] Timer leak on zero-delay path: when **[MINOR]** Timer leak on zero-delay path: when `delay == 0` (i.e., `attempt-1 >= len(backoff)` or `backoff` is empty), the code skips the timer block entirely and logs nothing. This is correct behavior. However, the `slog.Warn` log is only emitted when `delay > 0`, meaning silent retries happen with no observability when the backoff slice is shorter than `maxAttempts-1`. Consider logging retries even when delay is zero, or document this intentional behavior in the comment.
|
||||
|
||||
@@ -367,9 +427,9 @@ func (c *Client) GetAllFilesInPath(ctx context.Context, owner, repo, path string
|
||||
|
||||
// Review represents a pull request review from the Gitea API.
|
||||
type Review struct {
|
||||
ID int64 `json:"id"`
|
||||
Body string `json:"body"`
|
||||
User struct {
|
||||
ID int64 `json:"id"`
|
||||
Body string `json:"body"`
|
||||
User struct {
|
||||
Login string `json:"login"`
|
||||
} `json:"user"`
|
||||
State string `json:"state"`
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
@@ -585,9 +586,9 @@ func TestGetAllFilesInPath_403Propagates(t *testing.T) {
|
||||
|
||||
func TestIsNotFound(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
err error
|
||||
want bool
|
||||
name string
|
||||
err error
|
||||
want bool
|
||||
}{
|
||||
{"nil error", nil, false},
|
||||
{"non-API error", fmt.Errorf("network timeout"), false},
|
||||
@@ -788,6 +789,9 @@ func TestDoGet_RetriesOn500(t *testing.T) {
|
||||
defer server.Close()
|
||||
|
||||
client := NewClient(server.URL, "test-token")
|
||||
// Use short backoff for fast tests
|
||||
client.RetryBackoff = []time.Duration{1 * time.Millisecond, 1 * time.Millisecond}
|
||||
|
||||
body, err := client.doGet(context.Background(), server.URL+"/test")
|
||||
if err != nil {
|
||||
t.Fatalf("expected success after retry, got error: %v", err)
|
||||
@@ -810,6 +814,9 @@ func TestDoGet_FailsAfterMaxRetries(t *testing.T) {
|
||||
defer server.Close()
|
||||
|
||||
client := NewClient(server.URL, "test-token")
|
||||
// Use short backoff for fast tests
|
||||
client.RetryBackoff = []time.Duration{1 * time.Millisecond, 1 * time.Millisecond}
|
||||
|
||||
_, err := client.doGet(context.Background(), server.URL+"/test")
|
||||
if err == nil {
|
||||
t.Fatal("expected error after max retries")
|
||||
@@ -862,19 +869,90 @@ func TestDoGet_RespectsContextCancellation(t *testing.T) {
|
||||
defer server.Close()
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
// Cancel immediately after first attempt would trigger retry
|
||||
|
||||
client := NewClient(server.URL, "test-token")
|
||||
// Use longer backoff to give us time to cancel during the wait
|
||||
client.RetryBackoff = []time.Duration{100 * time.Millisecond, 100 * time.Millisecond}
|
||||
|
||||
// Cancel after first attempt returns and retry begins
|
||||
go func() {
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
cancel()
|
||||
}()
|
||||
|
||||
client := NewClient(server.URL, "test-token")
|
||||
_, err := client.doGet(ctx, server.URL+"/test")
|
||||
if err == nil {
|
||||
t.Fatal("expected error on context cancellation")
|
||||
}
|
||||
// Should have made 1 attempt, then context cancelled during backoff
|
||||
if attempts > 2 {
|
||||
t.Errorf("attempts = %d, expected at most 2 before context cancel", attempts)
|
||||
if attempts != 1 {
|
||||
t.Errorf("attempts = %d, expected 1 before context cancel during backoff", attempts)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDoGet_RetriesOnTemporaryNetError(t *testing.T) {
|
||||
|
sonnet-review-bot
commented
[MINOR] **[MINOR]** `TestDoGet_RetriesOnTemporaryNetError` is a timing-dependent test that explicitly acknowledges it may not verify actual retry behavior ('The request might succeed or fail depending on timing'). It provides almost no guaranteed assertion — `attempts` could be 0 and the test still passes. This test would be more valuable if it used a controlled mechanism (e.g., an atomic counter with a channel to sequence listener startup) rather than relying on `time.Sleep`. As-is, it's essentially a no-op assertion that just verifies the code path compiles.
|
||||
attempts := 0
|
||||
|
||||
// Create a listener that we can close to simulate connection refused
|
||||
ln, err := net.Listen("tcp", "127.0.0.1:0")
|
||||
|
sonnet-review-bot
commented
[MINOR] **[MINOR]** `TestDoGet_RetriesOnTemporaryNetError` is timing-dependent and makes no deterministic assertions — it explicitly says "actual retry behavior depends on timing". The test comment acknowledges this. While the test does exercise the code path, it provides weak guarantees. A more reliable approach would be to use `isTemporaryNetError` unit tests (which are present) and a separate test with a mock/stub transport that returns `syscall.ECONNREFUSED` directly, avoiding the real TCP listener race.
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create listener: %v", err)
|
||||
}
|
||||
addr := ln.Addr().String()
|
||||
|
||||
// Close immediately to cause connection refused on first attempts
|
||||
ln.Close()
|
||||
|
||||
// Start a server after a short delay to succeed on retry
|
||||
go func() {
|
||||
|
sonnet-review-bot
commented
[NIT] **[NIT]** `TestDoGet_RetriesOnTemporaryNetError` has no meaningful assertions — the comment says 'actual retry behavior depends on timing' and the return values are discarded with `_, _`. This test exercises the code path but doesn't verify any postcondition, making it closer to a smoke test. It won't cause flakiness but also won't catch regressions. Consider whether it provides sufficient value or should be replaced with a more deterministic approach (e.g., using a custom http.Transport that fails N times then succeeds).
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
newLn, err := net.Listen("tcp", addr)
|
||||
if err != nil {
|
||||
// Port might be reused; not critical for this test
|
||||
return
|
||||
}
|
||||
defer newLn.Close()
|
||||
|
||||
for {
|
||||
conn, err := newLn.Accept()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
attempts++
|
||||
// Respond with success
|
||||
conn.Write([]byte("HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nok"))
|
||||
conn.Close()
|
||||
}
|
||||
}()
|
||||
|
||||
client := NewClient("http://"+addr, "test-token")
|
||||
client.RetryBackoff = []time.Duration{10 * time.Millisecond, 10 * time.Millisecond}
|
||||
|
||||
// The request might succeed or fail depending on timing, but the key is
|
||||
// that we attempt retry on connection refused
|
||||
_, _ = client.doGet(context.Background(), "http://"+addr+"/test")
|
||||
|
||||
// This test verifies the code path exists; actual retry behavior depends on timing
|
||||
}
|
||||
|
||||
func TestIsTemporaryNetError(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
err error
|
||||
want bool
|
||||
}{
|
||||
{"nil error", nil, false},
|
||||
{"plain error", fmt.Errorf("some error"), false},
|
||||
{"OpError", &net.OpError{Op: "dial", Err: fmt.Errorf("connection refused")}, true},
|
||||
{"temporary DNSError", &net.DNSError{IsTemporary: true}, true},
|
||||
{"non-temporary DNSError", &net.DNSError{IsTemporary: false}, false},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := isTemporaryNetError(tt.err)
|
||||
if got != tt.want {
|
||||
t.Errorf("isTemporaryNetError(%v) = %v, want %v", tt.err, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
[MINOR] RetryBackoff is an exported field on Client, which is documented as 'safe for concurrent use by multiple goroutines'. However, the comment on RetryBackoff says 'Modifying it while requests are in flight is not safe.' This creates an inconsistency: the type-level doc says concurrent-safe, but this field is an exception. Consider either making RetryBackoff unexported and configuring it only through a constructor option (e.g., a WithRetryBackoff option function or NewClient parameter), or at minimum adding a prominent warning at the type level that RetryBackoff must be configured before first use.