feat(gitea): add retry logic for 5xx errors #69
@@ -15,6 +15,7 @@ import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
@@ -56,6 +57,9 @@ type Client struct {
|
||||
// RetryBackoff defines the delays between retry attempts.
|
||||
// RetryBackoff[i] is the delay before attempt i+1 (after attempt i fails).
|
||||
// If nil, defaults to {1s, 2s}. Set to shorter durations in tests.
|
||||
//
|
||||
// This field must be configured before the first request is made.
|
||||
// Modifying it while requests are in flight is not safe.
|
||||
RetryBackoff []time.Duration
|
||||
}
|
||||
|
||||
@@ -223,25 +227,25 @@ func (c *Client) PostReview(ctx context.Context, owner, repo string, number int,
|
||||
}
|
||||
|
||||
// isTemporaryNetError reports whether err is a temporary network error worth retrying.
|
||||
// This includes connection refused, DNS failures, and timeouts that aren't context-based.
|
||||
// This includes connection refused, network unreachable, connection reset, and DNS
|
||||
|
|
||||
// timeouts. It explicitly excludes permanent errors like permission denied or
|
||||
// "no such host" DNS failures.
|
||||
func isTemporaryNetError(err error) bool {
|
||||
if err == nil {
|
||||
|
gpt-review-bot
commented
[MINOR] isTemporaryNetError treats any net.OpError as retriable, which may cause retries on permanent failures (e.g., no such host). Consider narrowing the check (e.g., prefer net.Error timeouts, DNSError.IsTemporary, or specific syscall errors) to avoid unnecessary retries. **[MINOR]** isTemporaryNetError treats any net.OpError as retriable, which may cause retries on permanent failures (e.g., no such host). Consider narrowing the check (e.g., prefer net.Error timeouts, DNSError.IsTemporary, or specific syscall errors) to avoid unnecessary retries.
|
||||
return false
|
||||
}
|
||||
|
security-review-bot marked this conversation as resolved
Outdated
[MINOR] On non-2xx responses, the code reads the entire response body with io.ReadAll and stores it in APIError. This can allow a malicious or misbehaving server to force large allocations. Consider limiting the size for error bodies (e.g., io.LimitReader) since the body is only used for error context. **[MINOR]** On non-2xx responses, the code reads the entire response body with io.ReadAll and stores it in APIError. This can allow a malicious or misbehaving server to force large allocations. Consider limiting the size for error bodies (e.g., io.LimitReader) since the body is only used for error context.
|
||||
|
||||
// Check for common retriable error patterns in the error chain.
|
||||
// Check OpError first since it embeds net.Error, and we want to catch
|
||||
// connection refused, network unreachable, etc. as retriable.
|
||||
// Check for OpError and inspect the underlying syscall error.
|
||||
// Not all OpErrors are transient — permission denied, for example, is permanent.
|
||||
var opErr *net.OpError
|
||||
if errors.As(err, &opErr) {
|
||||
// Connection refused, network unreachable, etc. are typically transient
|
||||
return true
|
||||
return isRetriableSyscallError(opErr.Err)
|
||||
|
sonnet-review-bot
commented
[MINOR] Network/transport errors from **[MINOR]** Network/transport errors from `c.http.Do(req)` (e.g., connection refused, DNS failure) are returned immediately without retry. Transient network errors are often just as worth retrying as 5xx responses. This is a design choice, but worth noting — the PR description only mentions 5xx so this is intentional, but it limits the usefulness of the retry in some failure modes.
|
||||
}
|
||||
|
||||
// DNS errors are often transient
|
||||
// DNS errors: only retry on timeout, not on "no such host" which is permanent.
|
||||
var dnsErr *net.DNSError
|
||||
if errors.As(err, &dnsErr) {
|
||||
return dnsErr.Temporary()
|
||||
return dnsErr.IsTimeout
|
||||
}
|
||||
|
||||
// Check for net.Error with Timeout() (Temporary is deprecated)
|
||||
@@ -253,6 +257,35 @@ func isTemporaryNetError(err error) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// isRetriableSyscallError reports whether the underlying error from a net.OpError
|
||||
|
gpt-review-bot
commented
[NIT] The retry condition duplicates the 5xx check inline. For consistency and reuse, consider using the new IsServerError helper here (errors.As on APIError or check status) to keep logic centralized. **[NIT]** The retry condition duplicates the 5xx check inline. For consistency and reuse, consider using the new IsServerError helper here (errors.As on APIError or check status) to keep logic centralized.
|
||||
// is a transient syscall error worth retrying.
|
||||
func isRetriableSyscallError(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
// Check for syscall.Errno directly or wrapped
|
||||
var errno syscall.Errno
|
||||
if errors.As(err, &errno) {
|
||||
switch errno {
|
||||
|
sonnet-review-bot
commented
[NIT] Network errors from **[NIT]** Network errors from `c.http.Do(req)` are returned immediately without retry. Transient network errors (e.g., connection reset, EOF) are just as likely to be transient as 5xx responses. This is a deliberate design choice and acceptable, but worth a comment explaining why only 5xx is retried and not transport errors.
|
||||
case syscall.ECONNREFUSED, // connection refused — server not listening
|
||||
|
gpt-review-bot
commented
[NIT] Retry log 'attempt' field uses attempt+1 during backoff, which may be interpreted ambiguously. Consider logging the retry number explicitly (e.g., 'retry' count) or the next attempt number for clarity. **[NIT]** Retry log 'attempt' field uses attempt+1 during backoff, which may be interpreted ambiguously. Consider logging the retry number explicitly (e.g., 'retry' count) or the next attempt number for clarity.
|
||||
syscall.ECONNRESET, // connection reset by peer
|
||||
syscall.ENETUNREACH, // network unreachable
|
||||
syscall.EHOSTUNREACH, // host unreachable
|
||||
syscall.ETIMEDOUT: // connection timed out
|
||||
return true
|
||||
default:
|
||||
|
sonnet-review-bot
commented
[MINOR] In **[MINOR]** In `isTemporaryNetError`, treating all `*net.OpError` as retriable is overly broad. OpError covers connection refused (transient) but also permission denied and other permanent conditions. A more precise check would inspect `opErr.Err` for syscall errors. That said, the comment acknowledges this and the practical impact on the Gitea use-case is low.
|
||||
// EACCES, EPERM, etc. are permanent — don't retry
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// If we can't identify the specific syscall error, be conservative and retry.
|
||||
|
sonnet-review-bot
commented
[NIT] The **[NIT]** The `GetAllFilesInPath_500Propagates` test (already existing) relies on the server always returning 500, but with retry logic now in place, the test server will be hit 3 times before the error propagates. The test still passes (because the server always returns 500), but test execution time increases by ~3ms (with default 0-delay in tests using `RetryBackoff = nil`). Tests for `GetAllFilesInPath` don't set `RetryBackoff`, so they'll use the default 1s/2s delays — but since `httptest.NewServer` is local, the server succeeds immediately and returns 500, so these tests hit the server 3 times with 1s+2s delays... Actually this could make existing integration-style tests significantly slower. Worth verifying that existing tests (not the new retry tests) that hit 500 responses set short `RetryBackoff`, or accept the 3s slowdown.
|
||||
// This handles wrapped errors or platform-specific error types.
|
||||
// The retry count is limited, so erring on the side of retrying is safe.
|
||||
return true
|
||||
}
|
||||
|
||||
// doGet performs an HTTP GET request with retry on 5xx errors and temporary
|
||||
// network errors. Retries up to 3 times with exponential backoff (1s, 2s delays
|
||||
// by default; configurable via Client.RetryBackoff for testing).
|
||||
@@ -272,7 +305,9 @@ func (c *Client) doGet(ctx context.Context, reqURL string) ([]byte, error) {
|
||||
var lastErr error
|
||||
for attempt := 0; attempt < maxAttempts; attempt++ {
|
||||
if attempt > 0 {
|
||||
|
sonnet-review-bot
commented
[MINOR] The **[MINOR]** The `RetryBackoff` field is exported on `Client`, making it part of the public API surface. This is used as a testing seam, but exporting a mutable slice field on a type documented as 'safe for concurrent use' is a design smell. The field comment itself warns 'Modifying it while requests are in flight is not safe', which contradicts the concurrency safety claim for the type. A cleaner pattern would be to keep it unexported and provide a `WithRetryBackoff([]time.Duration) *Client` constructor option, or accept it only in a constructor. Since it's already exported and the comment addresses the limitation, this is minor but worth noting for future refactors.
|
||||
// Determine delay: use backoff slice if available, otherwise no delay
|
||||
// Determine delay: use backoff slice if available, otherwise retry immediately.
|
||||
// An empty RetryBackoff slice means "retry without delay" — this is intentional
|
||||
// as the caller explicitly configured no delays.
|
||||
|
sonnet-review-bot
commented
[NIT] The log field name **[NIT]** The log field name `lastError` uses camelCase. slog convention (and the existing `slog.Warn` call at line 323) uses lowercase snake-case or single-word keys. Recommend `"last_error"` or `"error"` for consistency with the other log call in this function which uses `"error"`.
|
||||
var delay time.Duration
|
||||
if attempt-1 < len(backoff) {
|
||||
delay = backoff[attempt-1]
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"syscall"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
@@ -943,9 +944,20 @@ func TestIsTemporaryNetError(t *testing.T) {
|
||||
}{
|
||||
{"nil error", nil, false},
|
||||
{"plain error", fmt.Errorf("some error"), false},
|
||||
{"OpError", &net.OpError{Op: "dial", Err: fmt.Errorf("connection refused")}, true},
|
||||
{"temporary DNSError", &net.DNSError{IsTemporary: true}, true},
|
||||
{"non-temporary DNSError", &net.DNSError{IsTemporary: false}, false},
|
||||
// OpError with retriable syscall errors
|
||||
{"OpError ECONNREFUSED", &net.OpError{Op: "dial", Err: syscall.ECONNREFUSED}, true},
|
||||
{"OpError ECONNRESET", &net.OpError{Op: "read", Err: syscall.ECONNRESET}, true},
|
||||
{"OpError ENETUNREACH", &net.OpError{Op: "dial", Err: syscall.ENETUNREACH}, true},
|
||||
{"OpError EHOSTUNREACH", &net.OpError{Op: "dial", Err: syscall.EHOSTUNREACH}, true},
|
||||
{"OpError ETIMEDOUT", &net.OpError{Op: "dial", Err: syscall.ETIMEDOUT}, true},
|
||||
// OpError with permanent syscall errors — should NOT retry
|
||||
{"OpError EACCES", &net.OpError{Op: "dial", Err: syscall.EACCES}, false},
|
||||
{"OpError EPERM", &net.OpError{Op: "dial", Err: syscall.EPERM}, false},
|
||||
// OpError with unknown inner error — conservative retry
|
||||
{"OpError unknown inner", &net.OpError{Op: "dial", Err: fmt.Errorf("unknown")}, true},
|
||||
// DNS errors
|
||||
{"DNS timeout", &net.DNSError{IsTimeout: true}, true},
|
||||
{"DNS no such host", &net.DNSError{IsTimeout: false, Name: "bad.host"}, false},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
@@ -956,3 +968,30 @@ func TestIsTemporaryNetError(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsRetriableSyscallError(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
err error
|
||||
want bool
|
||||
}{
|
||||
{"nil", nil, false},
|
||||
{"ECONNREFUSED", syscall.ECONNREFUSED, true},
|
||||
{"ECONNRESET", syscall.ECONNRESET, true},
|
||||
{"ENETUNREACH", syscall.ENETUNREACH, true},
|
||||
{"EHOSTUNREACH", syscall.EHOSTUNREACH, true},
|
||||
{"ETIMEDOUT", syscall.ETIMEDOUT, true},
|
||||
{"EACCES (permanent)", syscall.EACCES, false},
|
||||
{"EPERM (permanent)", syscall.EPERM, false},
|
||||
{"ENOENT (permanent)", syscall.ENOENT, false},
|
||||
{"unknown error", fmt.Errorf("something"), true}, // conservative retry
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := isRetriableSyscallError(tt.err)
|
||||
if got != tt.want {
|
||||
t.Errorf("isRetriableSyscallError(%v) = %v, want %v", tt.err, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
[MINOR] Using time.After in a select with context cancellation can leave a timer to fire after cancellation. Consider using time.NewTimer/backoffTimer := time.NewTimer(d) and deferring backoffTimer.Stop() to avoid unnecessary timers if ctx is canceled before the delay elapses.