From 5d2e5b43c38955d9ec69f7f6c5d8b1ee68a03b3c Mon Sep 17 00:00:00 2001 From: Aaron Weiker Date: Thu, 30 Apr 2026 13:41:33 +0000 Subject: [PATCH] docs: add when/when-not to all Kubernetes patterns --- patterns/patterns.md | 447 ++++++++++++++++++++++++++++ patterns/production-go.md | 607 +++++++++++++++++++++++++++++++++++++- smells/anti-patterns.md | 430 ++++++++++++++++++++++++++- 3 files changed, 1480 insertions(+), 4 deletions(-) diff --git a/patterns/patterns.md b/patterns/patterns.md index 3943507..42fd4d3 100644 --- a/patterns/patterns.md +++ b/patterns/patterns.md @@ -106,6 +106,44 @@ func (c *Scheduler) Reconcile(ctx context.Context, key string) error { } ``` +### When NOT to Use + +**Don't use this when:** +- The operation is truly one-shot with no ongoing state to maintain (e.g., a CLI tool, a migration script) +- You're building a simple request/response service where each request is independent +- The "desired state" changes so frequently that reconciliation would always be stale (use streaming/event-driven instead) + +**Over-application example:** +```go +// Over-engineered: a controller for sending one-time notifications +type NotificationController struct { + queue workqueue.TypedRateLimitingInterface[string] +} + +func (c *NotificationController) Reconcile(ctx context.Context, key string) error { + notification, _ := c.lister.Get(key) + if notification.Status.Sent { + return nil // already sent + } + err := c.sendEmail(notification) + if err == nil { + notification.Status.Sent = true + c.client.Update(ctx, notification) + } + return err +} +``` + +**Better alternative:** +```go +// Simple job processor: one-time work doesn't need continuous reconciliation +func processNotification(ctx context.Context, notification Notification) error { + return sendEmail(notification) +} +``` + +**Why:** The controller pattern adds substantial complexity (informers, caches, workqueue, leader election). If your state doesn't drift — if once you do the thing, it stays done — a simple worker or job queue is far more appropriate. + ### Key Properties 1. **Level-triggered, not edge-triggered** — the sync loop reads current state, not diffs 2. **Idempotent** — running sync twice produces the same result @@ -182,6 +220,75 @@ if !cache.WaitForNamedCacheSyncWithContext(ctx, dc.dListerSynced, dc.rsListerSyn } ``` +### When to Use + +**Triggers:** +- Multiple components in the same process need to read the same Kubernetes resources +- You're building a controller that reacts to state changes but needs fast local reads +- API server load is a concern (you have many controllers or high-frequency reads) + +**Example — before:** +```go +// Every reconcile calls the API server directly — O(controllers × syncs/sec) load +func (c *Controller) Reconcile(ctx context.Context, key string) error { + pod, err := c.client.CoreV1().Pods(ns).Get(ctx, name, metav1.GetOptions{}) + if err != nil { return err } + + nodes, err := c.client.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { return err } + // ... reconcile +} +``` + +**Example — after:** +```go +// Reads from local cache — zero API server load for reads +func (c *Controller) Reconcile(ctx context.Context, key string) error { + pod, err := c.podLister.Pods(ns).Get(name) // local cache + if err != nil { return err } + + nodes, err := c.nodeLister.List(labels.Everything()) // local cache + if err != nil { return err } + // ... reconcile (only writes hit the API server) +} +``` + +### When NOT to Use + +**Don't use this when:** +- You're building a short-lived CLI tool or one-shot script (informers need time to sync) +- You need strongly consistent reads (informer cache is eventually consistent — may lag by seconds) +- You only access a resource once or twice (the overhead of List+Watch isn't justified) + +**Over-application example:** +```go +// One-shot migration script using informers — overkill +func main() { + factory := informers.NewSharedInformerFactory(client, 0) + podInformer := factory.Core().V1().Pods() + factory.Start(ctx.Done()) + factory.WaitForCacheSync(ctx.Done()) + + pods, _ := podInformer.Lister().List(labels.Everything()) + for _, pod := range pods { + migratePod(pod) + } +} +``` + +**Better alternative:** +```go +// Just list directly — you only need the data once +func main() { + pods, _ := client.CoreV1().Pods("").List(ctx, metav1.ListOptions{}) + for _, pod := range pods.Items { + migratePod(&pod) + } +} +``` + +**Why:** Informers are designed for long-running processes that continuously react to changes. For one-shot reads, a direct List call is simpler, faster to start, and uses less memory. + --- ## 3. Workqueue: Typed Rate-Limiting Queue @@ -242,6 +349,39 @@ if err != nil { queue.Done(key) ``` +### When NOT to Use + +**Don't use this when:** +- Order of processing matters (the workqueue doesn't guarantee FIFO for rate-limited items) +- Each event carries unique payload data that must be processed individually (workqueue only stores keys, not event data) +- You have a single producer and single consumer with no contention (a plain channel suffices) + +**Over-application example:** +```go +// Using workqueue for ordered log processing — wrong tool +queue := workqueue.NewTypedRateLimitingQueue[string](limiter) + +// Each log line is unique and order matters +queue.Add("log-line-1") +queue.Add("log-line-2") +queue.Add("log-line-3") +// Problem: if "log-line-1" fails and gets rate-limited, +// "log-line-2" processes first → out of order +``` + +**Better alternative:** +```go +// Ordered processing needs a simple buffered channel or sequential queue +logs := make(chan LogEntry, 1000) +for entry := range logs { + if err := processLog(entry); err != nil { + retryWithBackoff(entry) // handle retry inline, preserving order + } +} +``` + +**Why:** The workqueue's strength is deduplication and rate-limited retry for key-based reconciliation. If your items are unique (not deduplicate-able) or ordering matters, use a channel or ordered queue instead. + ### The Dirty/Processing Dance ```go @@ -326,6 +466,67 @@ func (dc *DeploymentController) deleteDeployment(logger klog.Logger, obj interfa } ``` +### When to Use + +**Triggers:** +- You're writing a delete event handler for a Kubernetes informer +- Your system uses List+Watch and must handle watch disconnection gracefully +- You need to process all deletions, including those that occurred during downtime + +**Example — before:** +```go +// Naive delete handler — breaks silently during watch reconnection +func (c *Controller) onDelete(obj interface{}) { + pod := obj.(*v1.Pod) // PANIC: obj might be DeletedFinalStateUnknown + c.cleanup(pod) +} +``` + +**Example — after:** +```go +// Tombstone-aware delete handler — handles all cases +func (c *Controller) onDelete(obj interface{}) { + pod, ok := obj.(*v1.Pod) + if !ok { + tombstone, ok := obj.(cache.DeletedFinalStateUnknown) + if !ok { + runtime.HandleError(fmt.Errorf("unexpected object type: %T", obj)) + return + } + pod, ok = tombstone.Obj.(*v1.Pod) + if !ok { + runtime.HandleError(fmt.Errorf("tombstone contained non-Pod: %T", tombstone.Obj)) + return + } + } + c.cleanup(pod) +} +``` + +### When NOT to Use + +**Don't use this when:** +- You're not using informers (direct API calls return concrete types, never tombstones) +- Your delete handler only enqueues a key for reconciliation (the reconciler will discover the deletion via a NotFound error from the lister — tombstone handling in the handler is optional) +- You're building a non-Kubernetes system (this is specific to client-go's watch semantics) + +**Over-application example:** +```go +// Tombstone handling in a reconciler that already handles "not found" +func (c *Controller) Reconcile(ctx context.Context, key string) error { + obj, err := c.lister.Get(key) + if errors.IsNotFound(err) { + // Object was deleted — clean up + return c.handleDeletion(key) + } + // No need for tombstone logic here — the lister never returns tombstones +} +``` + +**Better alternative:** Tombstone handling belongs in event handler callbacks (AddFunc/UpdateFunc/DeleteFunc), not in the reconcile loop. The reconciler discovers deletions through "not found" errors from the lister. + +**Why:** Tombstones are an artifact of the event delivery mechanism. If your architecture already handles "object doesn't exist" as a valid reconciliation state, you don't need to explicitly handle tombstones everywhere. + --- ## 5. Controller Expectations Pattern @@ -352,6 +553,79 @@ type ControllerExpectationsInterface interface { } ``` +### When to Use + +**Triggers:** +- Your controller creates or deletes child resources and uses informer cache to count them +- You observe duplicate creates/deletes during rapid reconciliation +- There's a visible lag between your write and the cache reflecting the new state + +**Example — before:** +```go +// Without expectations: reconcile loop creates duplicates +func (c *RSController) Reconcile(ctx context.Context, key string) error { + rs, _ := c.rsLister.Get(key) + pods, _ := c.podLister.Pods(rs.Namespace).List(selector) + + diff := int(*rs.Spec.Replicas) - len(pods) + if diff > 0 { + // Cache hasn't caught up from last reconcile → creates AGAIN + for i := 0; i < diff; i++ { + c.client.CoreV1().Pods(rs.Namespace).Create(ctx, newPod(), ...) + } + } + return nil +} +``` + +**Example — after:** +```go +// With expectations: skip reconcile until cache catches up +func (c *RSController) Reconcile(ctx context.Context, key string) error { + if !c.expectations.SatisfiedExpectations(logger, key) { + return nil // still waiting for previous creates to appear in cache + } + + rs, _ := c.rsLister.Get(key) + pods, _ := c.podLister.Pods(rs.Namespace).List(selector) + + diff := int(*rs.Spec.Replicas) - len(pods) + if diff > 0 { + c.expectations.ExpectCreations(logger, key, diff) + for i := 0; i < diff; i++ { + _, err := c.client.CoreV1().Pods(rs.Namespace).Create(ctx, newPod(), ...) + if err != nil { + c.expectations.CreationObserved(logger, key) // decrement on failure + } + } + } + return nil +} +``` + +### When NOT to Use + +**Don't use this when:** +- Your controller only updates existing resources (no creates/deletes of children) +- You use server-side apply or optimistic concurrency (conflicts are resolved by the API server) +- Your reconciliation is idempotent even with stale cache (e.g., "ensure this ConfigMap has these contents" — creating it twice returns AlreadyExists) + +**Over-application example:** +```go +// Expectations for a controller that only patches status — unnecessary +func (c *StatusController) Reconcile(ctx context.Context, key string) error { + if !c.expectations.SatisfiedExpectations(logger, key) { + return nil // This gate adds nothing — we're only patching, not creating + } + obj, _ := c.lister.Get(key) + return c.client.Status().Patch(ctx, obj, patch) +} +``` + +**Better alternative:** Just patch directly. Status updates via patch are idempotent and don't create duplicate resources. Expectations only matter when create/delete timing could cause over-provisioning. + +**Why:** Expectations add bookkeeping complexity. They solve a specific problem: cache lag causing duplicate creates/deletes. If your controller doesn't create or delete child resources, the problem doesn't exist. + --- ## 6. OwnerReference / Controller Ref Manager Pattern @@ -392,6 +666,72 @@ func (m *BaseControllerRefManager) ClaimObject(ctx context.Context, obj metav1.O } ``` +### When to Use + +**Triggers:** +- Your controller creates child resources that should be garbage-collected when the parent is deleted +- Multiple controllers might "compete" for ownership of the same resources (e.g., label selector changes) +- You need to handle adoption: a resource's owner was deleted but the resource still exists + +**Example — before:** +```go +// Manual cleanup — fragile, misses edge cases +func (c *Controller) deleteParent(ctx context.Context, parent *MyResource) error { + children, _ := c.childLister.List(labelsForParent(parent)) + for _, child := range children { + c.client.Delete(ctx, child.Name, metav1.DeleteOptions{}) + } + return c.client.Delete(ctx, parent.Name, metav1.DeleteOptions{}) + // What if we crash between deleting children and parent? + // What if labels change and we miss some children? +} +``` + +**Example — after:** +```go +// OwnerReference + garbage collector handles cleanup automatically +func (c *Controller) createChild(ctx context.Context, parent *MyResource) error { + child := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + OwnerReferences: []metav1.OwnerReference{ + *metav1.NewControllerRef(parent, myGVK), + }, + }, + } + _, err := c.client.CoreV1().Pods(parent.Namespace).Create(ctx, child, metav1.CreateOptions{}) + return err + // When parent is deleted, GC automatically deletes all children +} +``` + +### When NOT to Use + +**Don't use this when:** +- Child resources should outlive their parent (e.g., PersistentVolumes that persist after the PVC is deleted) +- Resources are truly independent — no parent/child lifecycle relationship exists +- You're working outside Kubernetes (OwnerReferences are a Kubernetes API concept) + +**Over-application example:** +```go +// Setting owner reference on a shared ConfigMap used by many controllers +func (c *Controller) ensureSharedConfig(ctx context.Context, parent *MyResource) error { + cm := &v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "shared-config", + OwnerReferences: []metav1.OwnerReference{ + *metav1.NewControllerRef(parent, myGVK), // WRONG: shared resource + }, + }, + } + // Problem: when this parent is deleted, the shared ConfigMap is garbage-collected, + // breaking all other controllers that depend on it +} +``` + +**Better alternative:** Shared resources should not have owner references. Use finalizers on the parent to perform cleanup only of resources that are truly owned, or use labels + a dedicated cleanup controller. + +**Why:** OwnerReferences create a hard lifecycle coupling: parent deletion cascades to children. This is exactly right for "this ReplicaSet owns these Pods" but catastrophic for shared resources. + --- ## 7. Leader Election Pattern @@ -438,6 +778,42 @@ func main() { } ``` +### When NOT to Use + +**Don't use this when:** +- Your workload can be safely sharded across replicas (e.g., each replica handles a different namespace) +- The work is read-only or idempotent at the individual item level (multiple readers are fine) +- You only ever run one replica (single-instance deployment — leader election is pointless overhead) + +**Over-application example:** +```go +// Leader election for a metrics collector that only reads +func main() { + leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{ + Lock: resourceLock, + Callbacks: leaderelection.LeaderCallbacks{ + OnStartedLeading: func(ctx context.Context) { + collectMetrics(ctx) // reads metrics from nodes — safe to run on all replicas + }, + }, + }) + // Problem: 2 of 3 replicas sit idle, collecting no metrics +} +``` + +**Better alternative:** +```go +// All replicas collect metrics (sharded by node or running in parallel) +func main() { + nodes := getAssignedNodes() // shard assignment via consistent hashing + for _, node := range nodes { + go collectMetricsForNode(ctx, node) + } +} +``` + +**Why:** Leader election serializes all work to one instance. This is correct for writes that would conflict, but wasteful for reads or shardable work. Use it only when concurrent execution would cause correctness problems. + ### Why Controller-manager runs multiple replicas for HA. Only one should reconcile to avoid conflicts. @@ -520,3 +896,74 @@ func init() { runtime.Must(utilfeature.DefaultMutableFeatureGate.AddVersioned(defaultVersionedKubernetesFeatureGates)) } ``` + +### When to Use + +**Triggers:** +- You ship a binary to many environments (clusters, customers) that need different feature sets +- New features are risky and must be progressively rolled out (alpha → beta → GA) +- You need to disable a feature at runtime without redeploying (kill switch) +- Multiple versions of your software coexist, and features must have version-aware behavior + +**Example — before:** +```go +// Compile-time flags or environment variables — no lifecycle, no versioning +func reconcile(ctx context.Context, obj *MyResource) error { + if os.Getenv("ENABLE_NEW_SCHEDULING") == "true" { + return newSchedulingLogic(ctx, obj) + } + return oldSchedulingLogic(ctx, obj) +} +// Problem: no structured lifecycle. How do you deprecate this? When is it safe to remove? +``` + +**Example — after:** +```go +const NewSchedulingAlgorithm featuregate.Feature = "NewSchedulingAlgorithm" + +// Registered with lifecycle metadata +var defaultFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{ + NewSchedulingAlgorithm: {Default: false, PreRelease: featuregate.Alpha}, // v1.28 + // Later: {Default: true, PreRelease: featuregate.Beta} // v1.30 + // Later: {LockToDefault: true, PreRelease: featuregate.GA} // v1.32 +} + +func reconcile(ctx context.Context, obj *MyResource) error { + if featuregate.DefaultFeatureGate.Enabled(NewSchedulingAlgorithm) { + return newSchedulingLogic(ctx, obj) + } + return oldSchedulingLogic(ctx, obj) +} +``` + +### When NOT to Use + +**Don't use this when:** +- You have a simple application with one deployment target (just use config or env vars) +- The "feature" is actually user-facing configuration (use a config field on the resource, not a gate) +- You can safely always enable the feature (no risk, no need for a kill switch) + +**Over-application example:** +```go +// Feature gate for a trivial log format change — overkill +const JSONLogging featuregate.Feature = "JSONLogging" + +func setupLogger() { + if featuregate.DefaultFeatureGate.Enabled(JSONLogging) { + log.SetFormatter(JSONFormatter{}) + } else { + log.SetFormatter(TextFormatter{}) + } +} +// This never needs alpha/beta/GA lifecycle. It's just config. +``` + +**Better alternative:** +```go +// Simple configuration flag +type Config struct { + LogFormat string `json:"logFormat"` // "json" or "text" +} +``` + +**Why:** Feature gates add cognitive overhead (registry, lifecycle stages, version tracking). They're justified for behavioral changes that carry risk and need graduated rollout. For simple configuration choices with no risk dimension, a config field is clearer and simpler. diff --git a/patterns/production-go.md b/patterns/production-go.md index a418339..e77e320 100644 --- a/patterns/production-go.md +++ b/patterns/production-go.md @@ -83,6 +83,37 @@ type Deployment struct { // Adding a new field? Re-run generator. Zero chance of forgetting. ``` +### When NOT to Use + +**Don't use this when:** +- You have fewer than ~5 types (hand-writing is faster and more readable) +- The boilerplate varies significantly between types (generators work best for uniform patterns) +- The generated code would be harder to debug than hand-written code (e.g., complex business logic) + +**Over-application example:** +```go +// Generating a "ToString" method for 3 types — overkill +//go:generate stringer -type=Status,Priority,Phase + +// You now have 3 generated files, a build dependency on stringer, +// and anyone reading the code has to understand the generation pipeline +// for what amounts to 9 lines of hand-written code. +``` + +**Better alternative:** +```go +// Just write it — 3 types is not "at scale" +func (s Status) String() string { + switch s { + case Running: return "Running" + case Stopped: return "Stopped" + default: return fmt.Sprintf("Status(%d)", s) + } +} +``` + +**Why:** Code generation adds build complexity (Makefiles, CI steps, `go generate` ordering), makes debugging harder (stack traces point to generated code), and obscures logic. It pays off only when the volume of boilerplate is large enough that correctness via automation outweighs these costs. + ### Key Insight **Stdlib has no code generation culture.** stdlib keeps things small enough that hand-writing works. Kubernetes proves that once you cross ~20 types with shared behavior, code gen is the only sane path. @@ -155,6 +186,89 @@ func (s *Scheme) AddKnownTypes(gv schema.GroupVersion, types ...Object) { } ``` +### When to Use + +**Triggers:** +- You have many types that must be serialized/deserialized polymorphically (you read JSON/YAML and don't know the concrete type ahead of time) +- You need version conversion between different representations of the same concept +- Third parties need to register new types without modifying your core code + +**Example — before:** +```go +// Hard-coded switch statement — breaks open/closed principle +func Deserialize(data []byte) (Object, error) { + kind := extractKind(data) + switch kind { + case "Deployment": + var d Deployment + json.Unmarshal(data, &d) + return &d, nil + case "Service": + var s Service + json.Unmarshal(data, &s) + return &s, nil + // ... 50 more cases, each a potential bug + default: + return nil, fmt.Errorf("unknown kind: %s", kind) + } +} +``` + +**Example — after:** +```go +// Type registry — extensible, no switch statements +scheme := runtime.NewScheme() +scheme.AddKnownTypes(appsv1.SchemeGroupVersion, &Deployment{}, &ReplicaSet{}) +scheme.AddKnownTypes(corev1.SchemeGroupVersion, &Service{}, &Pod{}) + +func Deserialize(scheme *runtime.Scheme, data []byte) (Object, error) { + gvk := extractGVK(data) + obj, err := scheme.New(gvk) // creates correct concrete type + if err != nil { return nil, err } + json.Unmarshal(data, obj) + return obj, nil +} +// New types register themselves — no core code changes needed +``` + +### When NOT to Use + +**Don't use this when:** +- You have a small, fixed set of types that won't grow (a switch statement is simpler and fully type-safe at compile time) +- You don't need dynamic/polymorphic deserialization (you always know the concrete type) +- You're not building a plugin system (nobody external needs to register types) + +**Over-application example:** +```go +// Type registry for an internal service with 3 event types — over-engineered +var eventScheme = NewScheme() +func init() { + eventScheme.Register("OrderCreated", reflect.TypeOf(OrderCreated{})) + eventScheme.Register("OrderShipped", reflect.TypeOf(OrderShipped{})) + eventScheme.Register("OrderCanceled", reflect.TypeOf(OrderCanceled{})) +} +// Now debugging requires understanding the registry, reflection, runtime dispatch... +``` + +**Better alternative:** +```go +// Simple interface + switch — readable, debuggable, compile-time safe +type Event interface { EventType() string } + +func handleEvent(data []byte) error { + switch extractType(data) { + case "OrderCreated": + var e OrderCreated + json.Unmarshal(data, &e) + return processCreated(e) + case "OrderShipped": + // ... + } +} +``` + +**Why:** Type registries trade compile-time safety for runtime extensibility. In Kubernetes (50+ types, CRDs, multiple API groups), this tradeoff is essential. In a service with a handful of known types, you're paying the complexity cost (reflection, runtime errors, harder debugging) without the benefit. + ### Key Insight This is Java's ServiceLoader / dependency injection adapted for Go's type system. Stdlib uses interfaces; Kubernetes needs a **runtime type system on top of Go's static type system** because API objects must be dynamically dispatched across version boundaries. @@ -179,6 +293,66 @@ type Object interface { - `GetObjectKind()` — allows the serialization layer to determine what type an object is without reflection - `DeepCopyObject()` — enables safe concurrent access (informer cache is shared; mutations must happen on copies) +### When to Use + +**Triggers:** +- You're building a framework where heterogeneous types flow through a common pipeline (serialization, admission, storage) +- You need to identify the "kind" of an object at runtime without type switches +- Concurrent access requires copy semantics built into the type contract + +**Example — before:** +```go +// No common interface — every handler needs type assertions +func store(obj interface{}) error { + switch v := obj.(type) { + case *Deployment: + return storeDeployment(v) + case *Service: + return storeService(v) + // Every new type requires changes here + } +} +``` + +**Example — after:** +```go +// Common interface — generic pipeline handles any type +func store(obj runtime.Object) error { + gvk := obj.GetObjectKind().GroupVersionKind() + key := buildKey(gvk, obj) + data, _ := serialize(obj) + return etcd.Put(key, data) + // Works for any registered type — no switch statement +} +``` + +### When NOT to Use + +**Don't use this when:** +- Your types don't need polymorphic handling (you always know the concrete type at each call site) +- Deep copy is irrelevant (no shared caches, no concurrent access) +- You're not building framework-level infrastructure (application code rarely needs this) + +**Over-application example:** +```go +// Implementing runtime.Object on application-level domain types +type Invoice struct { ... } +func (i *Invoice) GetObjectKind() schema.ObjectKind { return &i.TypeMeta } +func (i *Invoice) DeepCopyObject() runtime.Object { return i.DeepCopy() } + +// Now your business logic imports k8s.io/apimachinery — for what? +``` + +**Better alternative:** Use a domain-specific interface that captures what your code actually needs: +```go +type Storable interface { + ID() string + Marshal() ([]byte, error) +} +``` + +**Why:** `runtime.Object` is designed for Kubernetes' specific needs (polymorphic serialization across API versions). Adopting it in non-Kubernetes code couples you to a heavy dependency for an abstraction that doesn't match your domain. + ### Key Insight **This is the foundation of Kubernetes' extensibility.** Any Go struct that satisfies these two methods can participate in the entire API machinery — serialization, storage, admission, informers, etc. CRDs generate code that implements this interface. @@ -201,6 +375,60 @@ deployment.Spec.Replicas = ptr.To[int32](3) _, err := client.AppsV1().Deployments(ns).Update(ctx, deployment, metav1.UpdateOptions{}) ``` +### When to Use + +**Triggers:** +- You read from a shared data structure (cache, registry, config store) and need to modify the result +- Multiple goroutines access the same data and at least one modifies it +- You're passing data across ownership boundaries (your function returns data that the caller might mutate) + +**Example — before:** +```go +// Returning a pointer to internal state — caller can corrupt it +func (c *ConfigStore) GetConfig() *Config { + return c.current // caller mutates this → corrupts store +} +``` + +**Example — after:** +```go +// Return a copy — caller can mutate freely +func (c *ConfigStore) GetConfig() *Config { + c.mu.RLock() + defer c.mu.RUnlock() + copy := c.current.DeepCopy() + return copy +} +``` + +### When NOT to Use + +**Don't use this when:** +- The data is owned by a single goroutine (no concurrent access, no shared cache) +- You only need to read the data, never modify it (deep copy for read-only access is wasted allocation) +- Performance is critical and you can prove safety via immutability or other means (e.g., freezing the object after construction) + +**Over-application example:** +```go +// Deep copying on every read, even when only logging +func (c *Controller) logStatus(ctx context.Context, key string) { + obj, _ := c.lister.Get(key) + copy := obj.DeepCopy() // Allocates! But we never mutate copy + logger.Info("status", "phase", copy.Status.Phase) +} +``` + +**Better alternative:** +```go +// Read directly from cache — no mutation, no copy needed +func (c *Controller) logStatus(ctx context.Context, key string) { + obj, _ := c.lister.Get(key) + logger.Info("status", "phase", obj.Status.Phase) // read-only: safe +} +``` + +**Why:** Deep copy allocates memory and does work proportional to object size. In hot paths that only read data, it's pure waste. Copy only when you intend to mutate. + ### Key Insight Stdlib rarely needs deep copy because stdlib objects are typically owned by one goroutine. Kubernetes has a **shared read cache** (the informer store) that necessitates copy-on-write semantics at the application level. @@ -231,6 +459,79 @@ type managerImpl struct { } ``` +### When to Use + +**Triggers:** +- Your system runs workloads with different importance levels (critical infrastructure vs. batch jobs) +- You receive shutdown signals and need to drain gracefully, not crash +- Some work MUST complete (leader lease release, data flush) while other work can be interrupted + +**Example — before:** +```go +// Flat shutdown: everything gets the same 30s, critical work might not finish +func main() { + sig := make(chan os.Signal, 1) + signal.Notify(sig, syscall.SIGTERM) + <-sig + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + server.Shutdown(ctx) // hope 30s is enough for everything +} +``` + +**Example — after:** +```go +// Priority-based shutdown: critical work gets more time +func main() { + sig := make(chan os.Signal, 1) + signal.Notify(sig, syscall.SIGTERM) + <-sig + + // Phase 1: stop accepting new work (immediate) + server.StopAccepting() + + // Phase 2: drain low-priority work (5s budget) + ctx1, cancel1 := context.WithTimeout(context.Background(), 5*time.Second) + batchWorker.Shutdown(ctx1) + cancel1() + + // Phase 3: drain critical work (25s budget) + ctx2, cancel2 := context.WithTimeout(context.Background(), 25*time.Second) + leaderElector.Release(ctx2) + dataStore.Flush(ctx2) + cancel2() +} +``` + +### When NOT to Use + +**Don't use this when:** +- All your work is equally important (flat shutdown with a single timeout is simpler) +- Your process is stateless and restarts are cheap (just die and restart) +- Shutdown time is not constrained (you can wait as long as needed for everything to drain) + +**Over-application example:** +```go +// Priority shutdown for a stateless HTTP server with no background work +type ShutdownManager struct { + priorities []PriorityClass + workers map[PriorityClass][]Worker +} +// All this machinery for... stopping an HTTP handler that has no in-flight state? +``` + +**Better alternative:** +```go +// Simple graceful shutdown — stateless server just drains connections +server := &http.Server{} +go server.ListenAndServe() +<-sigCh +server.Shutdown(ctx) // drains in-flight requests, done +``` + +**Why:** Priority-based shutdown adds ordering logic, timeout budgets per phase, and coordination complexity. It's justified when different subsystems have genuinely different importance. For simple services, `http.Server.Shutdown()` is all you need. + --- ## 6. Context as Logger Carrier @@ -252,6 +553,75 @@ At scale, you need structured logging with: - Verbosity levels (`logger.V(4).Info(...)`) - No global state (context carries the logger configured by the caller) +### When to Use + +**Triggers:** +- You have deep call stacks where log context (request ID, user, controller name) should propagate without explicit parameters +- Multiple subsystems share code but need different logger configurations (verbosity, output format) +- You want to enrich logs with contextual metadata without threading a logger through every function signature + +**Example — before:** +```go +// Global logger: no context, hard to filter, pollutes all output +var log = logrus.New() + +func processOrder(orderID string) error { + log.Info("processing order") // which order? which service? which request? + items := fetchItems(orderID) + log.Infof("fetched %d items", len(items)) // no correlation possible + return nil +} +``` + +**Example — after:** +```go +// Context-carried logger: each request has its own enriched logger +func processOrder(ctx context.Context, orderID string) error { + logger := klog.FromContext(ctx) + logger = logger.WithValues("orderID", orderID) + ctx = klog.NewContext(ctx, logger) + + logger.Info("processing order") // automatically includes orderID + items := fetchItems(ctx, orderID) + // fetchItems logs with the same logger — all lines correlate + return nil +} +``` + +### When NOT to Use + +**Don't use this when:** +- You have a simple CLI tool with sequential execution (a global logger is fine) +- Performance is critical and context allocation is measurable overhead (rare, but possible in hot loops) +- Your logging needs are simple and don't require request-scoped enrichment + +**Over-application example:** +```go +// Context logger in a tight computational loop — unnecessary overhead +func computeHash(ctx context.Context, data []byte) []byte { + logger := klog.FromContext(ctx) // context lookup on every call + logger.V(5).Info("computing hash", "size", len(data)) + // This is called 1M times/sec — the context lookup adds up + return sha256.Sum(data) +} +``` + +**Better alternative:** +```go +// Log once at the boundary, not in the hot loop +func processHashes(ctx context.Context, items [][]byte) [][]byte { + logger := klog.FromContext(ctx) + logger.Info("processing hashes", "count", len(items)) + results := make([][]byte, len(items)) + for i, data := range items { + results[i] = sha256.Sum(data) // no logging in hot path + } + return results +} +``` + +**Why:** Context-based logging is for request/operation scoping, not for instrumenting every function call. In hot paths, the overhead of context lookup and structured log construction matters. + ### Key Insight Stdlib's `log` package is global. Kubernetes uses context-based structured logging (`klog.FromContext`) to allow each call chain to carry its own logger configuration. This enables filtering by controller, verbosity tuning per-component, and correlation. @@ -297,6 +667,67 @@ APIs evolve. Adding a new configuration option shouldn't break callers. Function - Self-documenting (each option is a named function) - Composability (options can be collected and applied conditionally) +### When to Use + +**Triggers:** +- Your constructor has more than 3-4 optional parameters that grow over time +- You're building a library/SDK where backward compatibility matters across versions +- Different callers need different subsets of configuration (not everyone uses every option) + +**Example — before:** +```go +// Constructor with growing parameter list — breaks on every addition +func NewClient(addr string, timeout time.Duration, retries int, tls bool, cert string) *Client +// v2: added auth +func NewClient(addr string, timeout time.Duration, retries int, tls bool, cert string, token string) *Client +// Every caller must update, even if they don't use the new param +``` + +**Example — after:** +```go +func NewClient(addr string, opts ...ClientOption) *Client { + c := &Client{addr: addr, timeout: 30 * time.Second, retries: 3} + for _, opt := range opts { opt(c) } + return c +} + +func WithTimeout(d time.Duration) ClientOption { return func(c *Client) { c.timeout = d } } +func WithRetries(n int) ClientOption { return func(c *Client) { c.retries = n } } +func WithTLS(cert string) ClientOption { return func(c *Client) { c.tls = true; c.cert = cert } } +// Adding WithAuth doesn't break any existing callers +``` + +### When NOT to Use + +**Don't use this when:** +- You have 1-2 required parameters and nothing optional (just use a simple constructor) +- The configuration is static and well-known (a config struct is simpler and more discoverable) +- You're building an internal-only API that you control all callers of (breaking changes are cheap) + +**Over-application example:** +```go +// Functional options for a struct with one optional field +func NewLogger(opts ...LoggerOption) *Logger { + l := &Logger{level: InfoLevel} + for _, opt := range opts { opt(l) } + return l +} +func WithLevel(level Level) LoggerOption { return func(l *Logger) { l.level = level } } + +// Callers write: NewLogger(WithLevel(DebugLevel)) +// When they could write: NewLogger(DebugLevel) — simpler, clearer +``` + +**Better alternative:** +```go +// Simple constructor with the one meaningful parameter +func NewLogger(level Level) *Logger { + return &Logger{level: level} +} +``` + +**Why:** Functional options add indirection (each option is a closure) and reduce discoverability (you need to know which `With*` functions exist). For simple constructors with few parameters, a direct signature is clearer. Use functional options when the option space is large and growing. + --- ## 8. Type-Safe Generics in Critical Paths @@ -329,6 +760,67 @@ type Client[T objectWithMeta] struct { ### Why Before generics, Kubernetes used `interface{}` everywhere, requiring type assertions at every boundary. Generics eliminate entire classes of runtime panics and make the code self-documenting. +### When to Use + +**Triggers:** +- You have container types (queues, caches, pools) that currently use `interface{}`/`any` with type assertions +- Type assertion panics have caused production issues or require defensive coding at every call site +- You're building a library where callers benefit from compile-time type checking + +**Example — before:** +```go +// interface{} queue: type assertions at every boundary +type Queue struct { items []interface{} } +func (q *Queue) Add(item interface{}) { q.items = append(q.items, item) } +func (q *Queue) Get() interface{} { /* ... */ } + +// Caller must assert — runtime panic if wrong type sneaks in +item := queue.Get() +pod := item.(*v1.Pod) // panics if someone added a string +``` + +**Example — after:** +```go +// Generic queue: compile-time safety, no assertions needed +type Queue[T any] struct { items []T } +func (q *Queue[T]) Add(item T) { q.items = append(q.items, item) } +func (q *Queue[T]) Get() T { /* ... */ } + +// Caller gets the right type directly — no assertion, no panic +queue := NewQueue[*v1.Pod]() +pod := queue.Get() // already *v1.Pod at compile time +``` + +### When NOT to Use + +**Don't use this when:** +- You genuinely need heterogeneous collections (multiple unrelated types in one container) +- The generic constraint would be so broad (`any`) that you gain no type safety anyway +- You're adding generics to existing interfaces where all callers work fine with concrete types (generics for generics' sake) + +**Over-application example:** +```go +// Genericizing a function that only ever handles one type +func processItems[T any](items []T, fn func(T) error) error { + for _, item := range items { + if err := fn(item); err != nil { return err } + } + return nil +} +// Every call site: processItems[*Pod](pods, handlePod) +// When you could just write: for _, pod := range pods { handlePod(pod) } +``` + +**Better alternative:** +```go +// If the type is always known and the pattern is trivial, just loop +for _, pod := range pods { + if err := handlePod(pod); err != nil { return err } +} +``` + +**Why:** Generics shine when they eliminate runtime type assertions or enable reusable container/algorithm libraries. They add cognitive overhead (type parameter noise) when the code only ever operates on one concrete type. + ### Key Insight This is a migration pattern: introduce the generic version alongside the deprecated `interface{}` version using type aliases. Callers migrate at their own pace. @@ -384,8 +876,7 @@ func safeGo(ctx context.Context, name string, f func(ctx context.Context)) { go func() { defer func() { if r := recover(); r != nil { - log.Printf("panic in %s: %v -%s", name, r, debug.Stack()) + log.Printf("panic in %s: %v\n%s", name, r, debug.Stack()) // Log, alert, increment metric — but don't kill siblings } }() @@ -401,6 +892,47 @@ func main() { } ``` +### When NOT to Use + +**Don't use this when:** +- The panic indicates truly unrecoverable corruption (out of memory, data corruption) — recovering would hide the bug +- You're in a single-goroutine CLI tool (let it crash with a stack trace — that IS the right behavior) +- The goroutine holds locks or half-modified state that can't be safely abandoned (recovering leaves the system in an inconsistent state) + +**Over-application example:** +```go +// Recovering from panics in code that modifies shared state +func (c *Controller) unsafeSyncWithRecovery(ctx context.Context, key string) { + defer func() { + if r := recover(); r != nil { + log.Printf("recovered: %v", r) + // Problem: c.internalState might be half-modified + // The next sync will read corrupted state + } + }() + c.internalState.Phase = "processing" + riskyOperation() // panics here + c.internalState.Phase = "done" // never reached — state is now wrong +} +``` + +**Better alternative:** +```go +// If you can't guarantee clean recovery, don't recover — crash and restart +func (c *Controller) syncItem(ctx context.Context, key string) error { + // Use error returns, not panics, for expected failures + result, err := riskyOperation() + if err != nil { + return err // retry via workqueue — state is clean + } + c.internalState.Phase = "done" + return nil +} +// HandleCrash at the top level (worker loop) is fine — each sync starts fresh +``` + +**Why:** Panic recovery is safe when each iteration is isolated (starts from clean state, reads from cache). It's dangerous when recovery leaves half-mutated state that subsequent operations will read. + ### Key Insight Stdlib's approach is "let it crash." Kubernetes' approach is "catch it, log it, let the controller retry on the next sync." This is only safe because the controller pattern is idempotent. @@ -437,5 +969,76 @@ func (c channelContext) Err() error { ### Why Kubernetes predates `context.Context` (which arrived in Go 1.7). Millions of lines of code use `stopCh <-chan struct{}`. Rather than a big-bang rewrite, this adapter allows gradual migration. +### When to Use + +**Triggers:** +- You have legacy code using `<-chan struct{}` for cancellation that needs to call modern context-aware APIs +- You're doing a gradual migration from channels to context (not a big-bang rewrite) +- You need to integrate with a library that only accepts context, but your cancellation signal comes from a channel + +**Example — before:** +```go +// Legacy API: only speaks channels +func RunLegacyWorker(stopCh <-chan struct{}) { + for { + select { + case <-stopCh: + return + default: + doWork() + } + } +} + +// New dependency requires context — how to bridge? +func doWork() { + ctx := context.TODO() // wrong: doesn't cancel when stopCh closes + newLibrary.Call(ctx) +} +``` + +**Example — after:** +```go +func RunLegacyWorker(stopCh <-chan struct{}) { + ctx := ContextForChannel(stopCh) // bridge: closes when stopCh closes + for { + select { + case <-ctx.Done(): + return + default: + newLibrary.Call(ctx) // properly cancels when stopCh closes + } + } +} +``` + +### When NOT to Use + +**Don't use this when:** +- You're writing new code (just use `context.Context` from the start — no bridge needed) +- The channel carries data, not just a close signal (this only works for `<-chan struct{}`) +- You need context features beyond cancellation (timeouts, values) — add a real context + +**Over-application example:** +```go +// Using ContextForChannel in new code that already has context +func NewController(ctx context.Context) *Controller { + stopCh := make(chan struct{}) + go func() { <-ctx.Done(); close(stopCh) }() // convert context → channel + bridgedCtx := ContextForChannel(stopCh) // convert channel → context + // You just round-tripped for no reason — use ctx directly +} +``` + +**Better alternative:** +```go +func NewController(ctx context.Context) *Controller { + // Just use ctx — it already does everything you need + return &Controller{ctx: ctx} +} +``` + +**Why:** The bridge pattern exists for migration. In new code, using context directly is simpler, supports timeouts/values, and avoids the indirection of channel↔context conversion. + ### Key Insight **Large codebases can't do breaking API changes atomically.** This bridge pattern is how you evolve from one idiom to another over years without breaking everything at once. diff --git a/smells/anti-patterns.md b/smells/anti-patterns.md index 1efcd4d..8031322 100644 --- a/smells/anti-patterns.md +++ b/smells/anti-patterns.md @@ -18,7 +18,6 @@ deploymentCopy := deployment.DeepCopy() deploymentCopy.Spec.Replicas = ptr.To[int32](3) ``` - ### When to Apply This Rule **Triggers:** @@ -42,6 +41,12 @@ copy.Spec.Replicas = ptr.To[int32](5) client.Update(ctx, copy) ``` +### Exceptions + +- **Read-only access:** If you only read fields and never modify the object, deep copy is unnecessary overhead +- **Single-owner data:** If the struct was created locally and isn't shared (e.g., you just built it with `&Deployment{}`), copying your own data is wasteful +- **Immutable value types:** Primitive fields (string, int, bool) are copied by value in Go — you only need DeepCopy for slices, maps, and pointer fields + **Evidence:** The `runtime.Object` interface *mandates* `DeepCopyObject()`. Every API type has generated deep copy methods. The entire architecture assumes immutable reads. --- @@ -64,6 +69,48 @@ func (q *Typed[T]) Get() (item T, shutdown bool) { } ``` +### When to Apply This Rule + +**Triggers:** +- Multiple goroutines/workers process items from a shared queue or event stream +- Processing involves read-modify-write cycles on external state (API server, database) +- You observe optimistic concurrency conflicts (409 Conflict) or duplicate operations in logs + +**Example — detecting the smell:** +```go +// Multiple workers, no key-level serialization +for i := 0; i < 10; i++ { + go func() { + for key := range eventChannel { // same key can go to any worker + obj, _ := client.Get(key) + obj.Status.Count++ + client.Update(obj) // CONFLICT: another worker updated first + } + }() +} +``` + +**Example — fixed:** +```go +// Workqueue guarantees: one worker per key at a time +queue := workqueue.NewTypedRateLimitingQueue[string](limiter) +for i := 0; i < 10; i++ { + go func() { + for { + key, _ := queue.Get() // key is exclusively ours until Done() + reconcile(key) + queue.Done(key) + } + }() +} +``` + +### Exceptions + +- **Read-only operations:** If workers only read (metrics collection, logging), concurrent access to the same key is safe +- **Truly idempotent writes:** If the write is unconditional (e.g., PUT with a fixed value, not read-modify-write), concurrent processing produces the same result +- **Sharded ownership:** If each key is deterministically routed to exactly one worker (consistent hashing), the queue's built-in serialization is unnecessary + --- ## 3. Never Use Edge-Triggered Logic @@ -74,7 +121,6 @@ func (q *Typed[T]) Get() (item T, shutdown bool) { **The pattern K8s enforces:** Level-triggered reconciliation. The `syncHandler` reads *current state from the cache*, computes *desired state from the spec*, and makes the world match: - ### When to Apply This Rule **Triggers:** @@ -97,6 +143,12 @@ func reconcile(deployment Deployment) { } ``` +### Exceptions + +- **Audit logging / event streams:** Recording "what happened" is inherently edge-triggered — you want to log each event, not reconstruct history from state +- **Notifications:** "User X just logged in" is an event that triggers a one-time notification — level-triggered makes no sense here +- **Ordering-sensitive operations:** If the sequence of events matters (transaction log, command queue), level-triggered reconciliation would lose ordering information + ```go // The sync function always reads current state, never relies on "what happened" func (dc *DeploymentController) syncDeployment(ctx context.Context, key string) error { @@ -136,6 +188,46 @@ func (dc *DeploymentController) handleErr(ctx context.Context, err error, key st } ``` +### When to Apply This Rule + +**Triggers:** +- You're using a rate-limited workqueue and items can both succeed and fail +- You notice items processing slower over time for no apparent reason (accumulated backoff) +- Your error handling only calls `AddRateLimited` on failure but has no `Forget` on success + +**Example — detecting the smell:** +```go +func processItem(queue workqueue.RateLimitingInterface, key string) { + defer queue.Done(key) + err := reconcile(key) + if err != nil { + queue.AddRateLimited(key) // retry with backoff + return + } + // BUG: no queue.Forget(key)! + // Next time this key is processed (even for a new event), + // it gets rate-limited delay from the old failure counter +} +``` + +**Example — fixed:** +```go +func processItem(queue workqueue.RateLimitingInterface, key string) { + defer queue.Done(key) + err := reconcile(key) + if err != nil { + queue.AddRateLimited(key) + return + } + queue.Forget(key) // clear backoff counter on success +} +``` + +### Exceptions + +- **You're using a plain (non-rate-limited) queue:** `Forget()` only matters on `RateLimitingInterface`. Plain `Interface` has no rate limiter to clear. +- **Permanent failures where the key should never be fast-tracked:** If a key represents a permanently broken resource, you might intentionally not `Forget` so that future events for it are naturally throttled (rare — usually you'd remove it from the queue entirely). + --- ## 5. Never Hit the API Server in a Tight Loop @@ -153,6 +245,50 @@ deployment, err := dc.dLister.Deployments(namespace).Get(name) _, err = dc.client.AppsV1().Deployments(namespace).Update(ctx, deployment, ...) ``` +### When to Apply This Rule + +**Triggers:** +- Your reconcile function makes API calls (Get/List) inside a loop +- You observe API server throttling (429 responses) or high request latency +- Multiple controllers in the same process each independently fetch the same resources + +**Example — detecting the smell:** +```go +func (c *Controller) reconcile(ctx context.Context, deployment *apps.Deployment) error { + for i := 0; i < int(*deployment.Spec.Replicas); i++ { + // API call inside a loop — O(replicas) calls per sync + pod, err := c.client.CoreV1().Pods(ns).Get(ctx, podName(i), metav1.GetOptions{}) + if errors.IsNotFound(err) { + c.client.CoreV1().Pods(ns).Create(ctx, newPod(i), metav1.CreateOptions{}) + } + } + return nil +} +``` + +**Example — fixed:** +```go +func (c *Controller) reconcile(ctx context.Context, deployment *apps.Deployment) error { + // Single cache read (local, free) — gets ALL pods at once + existingPods, _ := c.podLister.Pods(ns).List(selectorForDeployment(deployment)) + existingSet := make(map[string]bool) + for _, p := range existingPods { existingSet[p.Name] = true } + + for i := 0; i < int(*deployment.Spec.Replicas); i++ { + if !existingSet[podName(i)] { + c.client.CoreV1().Pods(ns).Create(ctx, newPod(i), metav1.CreateOptions{}) + } + } + return nil +} +``` + +### Exceptions + +- **Writes are unavoidable:** You must hit the API server for creates/updates/deletes — the rule is about *reads*, not writes +- **Cache staleness is unacceptable:** Rare cases where you need a strongly consistent read (e.g., before an irreversible action) justify a direct Get +- **One-shot tools:** CLI commands or migration scripts don't benefit from informer caches (they exit after one operation) + --- ## 6. Never Sync Before Caches Are Warm @@ -170,6 +306,46 @@ if !cache.WaitForNamedCacheSyncWithContext(ctx, } ``` +### When to Apply This Rule + +**Triggers:** +- Your controller uses informer caches (Listers) to determine what actions to take +- You observe a burst of spurious creates/deletes immediately after startup +- Your reconcile logic compares "desired count vs actual count" using cached data + +**Example — detecting the smell:** +```go +func (c *Controller) Run(ctx context.Context) { + // BUG: starts workers immediately — cache might be empty + for i := 0; i < workers; i++ { + go c.worker(ctx) + } + // Informers haven't finished initial List yet + // Worker reads cache → sees 0 pods → creates all replicas again +} +``` + +**Example — fixed:** +```go +func (c *Controller) Run(ctx context.Context) { + // Gate: wait until all caches have completed initial List + if !cache.WaitForCacheSync(ctx.Done(), c.podsSynced, c.rsSynced) { + runtime.HandleError(fmt.Errorf("caches never synced")) + return + } + // Now safe to start workers — cache reflects reality + for i := 0; i < workers; i++ { + go c.worker(ctx) + } +} +``` + +### Exceptions + +- **Read-only controllers:** If your controller only observes and reports (metrics, logging), processing with a partial cache produces incomplete but not harmful results +- **Controllers that check existence before acting:** If your reconcile logic does a direct API Get (not cache read) before creating, the cache warmth gate is less critical (but still good practice) +- **Fast-starting controllers with explicit "not found" handling:** If your reconcile function handles "object not in cache" gracefully (e.g., requeues the key), it may tolerate starting before full sync + --- ## 7. Never Ignore Tombstones in Delete Handlers @@ -195,6 +371,51 @@ func (dc *DeploymentController) deleteDeployment(logger klog.Logger, obj interfa } ``` +### When to Apply This Rule + +**Triggers:** +- You're implementing a `DeleteFunc` event handler for an informer +- Your delete handler type-asserts directly to the concrete type without a fallback +- You observe "leaked" resources that should have been cleaned up after their parent was deleted + +**Example — detecting the smell:** +```go +informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + DeleteFunc: func(obj interface{}) { + pod := obj.(*v1.Pod) // PANIC during watch reconnection + cleanupPod(pod) + }, +}) +``` + +**Example — fixed:** +```go +informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + DeleteFunc: func(obj interface{}) { + pod, ok := obj.(*v1.Pod) + if !ok { + tombstone, ok := obj.(cache.DeletedFinalStateUnknown) + if !ok { + runtime.HandleError(fmt.Errorf("unexpected type %T", obj)) + return + } + pod, ok = tombstone.Obj.(*v1.Pod) + if !ok { + runtime.HandleError(fmt.Errorf("tombstone contained %T", tombstone.Obj)) + return + } + } + cleanupPod(pod) + }, +}) +``` + +### Exceptions + +- **Delete handler only extracts the key:** If your handler only needs `namespace/name` to enqueue work, you can use `cache.MetaNamespaceKeyFunc(obj)` which handles tombstones internally +- **Non-informer event sources:** If your events don't come from client-go informers (e.g., custom message queues), tombstones don't apply +- **Level-triggered reconcilers that don't use delete handlers:** If your reconcile loop discovers deletions via "not found" from the lister, the delete handler is optional anyway + --- ## 8. Never Use ResourceVersion for Equality @@ -216,6 +437,39 @@ func (dc *DeploymentController) updateReplicaSet(logger klog.Logger, old, cur in } ``` +### When to Apply This Rule + +**Triggers:** +- You're comparing ResourceVersions with `<`, `>`, or numeric parsing +- You're storing ResourceVersion as an integer for ordering or pagination +- You're using ResourceVersion to determine "which version is newer" + +**Example — detecting the smell:** +```go +// Treating ResourceVersion as a number — WRONG +rv1, _ := strconv.Atoi(obj1.ResourceVersion) +rv2, _ := strconv.Atoi(obj2.ResourceVersion) +if rv2 > rv1 { + // "obj2 is newer" — this assumption may break in future implementations +} +``` + +**Example — fixed:** +```go +// Only valid comparison: equality (to detect resync vs real update) +if cur.ResourceVersion == old.ResourceVersion { + return // no change — this is a periodic resync event +} +// Don't compare ordering — just process the update +processUpdate(cur) +``` + +### Exceptions + +- **The `== ` check for resync detection:** Comparing ResourceVersion for equality (`==`) to detect "no change" is valid and explicitly how Kubernetes uses it +- **Watch continuity:** Passing ResourceVersion to Watch/List for "resume from" is the intended API — you're not comparing, you're providing a cursor +- **Internal etcd tooling:** If you're operating directly on etcd (not through the Kubernetes API), mod_revision semantics are documented there + --- ## 9. Never Panic in Production Goroutines (Without Recovery) @@ -241,6 +495,41 @@ func BackoffUntilWithContext(ctx context.Context, f func(ctx context.Context), . } ``` +### When to Apply This Rule + +**Triggers:** +- You're launching goroutines without `defer recover()` or `defer HandleCrash()` +- Your process hosts multiple independent subsystems (any one panicking kills all) +- You observe random process deaths in production with nil pointer or index-out-of-range panics + +**Example — detecting the smell:** +```go +// Naked goroutine — one panic kills the whole process +func (c *Controller) Run(ctx context.Context) { + for i := 0; i < 5; i++ { + go c.worker(ctx) // no recovery — nil pointer in worker kills everything + } +} +``` + +**Example — fixed:** +```go +func (c *Controller) Run(ctx context.Context) { + for i := 0; i < 5; i++ { + go func() { + defer utilruntime.HandleCrash() + c.worker(ctx) + }() + } +} +``` + +### Exceptions + +- **Main goroutine of a CLI tool:** Let it panic — the stack trace IS the error report. Recovery here just hides bugs. +- **Panics that indicate programming errors you WANT to crash on:** `panic("unreachable")` or assertion failures during development should crash to catch bugs early. +- **Test code:** Tests should panic/fail loudly. `HandleCrash` in tests would swallow test failures. + --- ## 10. Never Block Workers Indefinitely @@ -257,6 +546,52 @@ defer cancel() _, err := client.CoreV1().Pods(ns).Create(ctx, pod, metav1.CreateOptions{}) ``` +### When to Apply This Rule + +**Triggers:** +- Your sync handler calls external services without a timeout +- Workers are a fixed pool and you observe queue depth growing while workers appear idle +- Your code has `select {}`, unbounded channel reads, or mutex waits without deadline + +**Example — detecting the smell:** +```go +func (c *Controller) syncItem(ctx context.Context, key string) error { + // Calls external service with no timeout — blocks indefinitely if service is down + resp, err := http.Get("http://external-service/api/" + key) + if err != nil { return err } + + // Waits on a channel that might never close + result := <-c.resultChan // blocks forever if producer crashes + return c.updateStatus(ctx, key, result) +} +``` + +**Example — fixed:** +```go +func (c *Controller) syncItem(ctx context.Context, key string) error { + // Bounded timeout on external call + reqCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + req, _ := http.NewRequestWithContext(reqCtx, "GET", "http://external-service/api/"+key, nil) + resp, err := http.DefaultClient.Do(req) + if err != nil { return err } // timeout returns error, worker is freed + + // Bounded wait on channel + select { + case result := <-c.resultChan: + return c.updateStatus(ctx, key, result) + case <-ctx.Done(): + return ctx.Err() // worker freed, key requeued + } +} +``` + +### Exceptions + +- **The worker loop itself:** The top-level `queue.Get()` blocks intentionally when the queue is empty — this is by design (workers sleep until there's work) +- **Graceful shutdown drains:** During shutdown, waiting for in-flight work to complete is acceptable (with a bounded overall shutdown timeout) +- **Leader election acquire:** Blocking until leadership is acquired is intentional — non-leaders should idle + --- ## 11. Never Use sync.Mutex Where sync.Once Suffices @@ -284,6 +619,55 @@ func (m *BaseControllerRefManager) CanAdopt(ctx context.Context) error { } ``` +### When to Apply This Rule + +**Triggers:** +- You see a `sync.Mutex` protecting a `bool` flag that tracks "has this been initialized?" +- The protected code runs exactly once and caches the result +- You see patterns like `mu.Lock(); if !done { doThing(); done = true }; mu.Unlock()` + +**Example — detecting the smell:** +```go +type Manager struct { + mu sync.Mutex + initialized bool + config *Config +} + +func (m *Manager) GetConfig() *Config { + m.mu.Lock() + defer m.mu.Unlock() + if !m.initialized { + m.config = loadConfig() + m.initialized = true + } + return m.config +} +// Bug-prone: what if someone adds code that sets initialized=false? +``` + +**Example — fixed:** +```go +type Manager struct { + initOnce sync.Once + config *Config +} + +func (m *Manager) GetConfig() *Config { + m.initOnce.Do(func() { + m.config = loadConfig() + }) + return m.config + // Impossible to accidentally re-initialize +} +``` + +### Exceptions + +- **Resettable state:** If the "one-time" operation might need to run again (e.g., reconnection after disconnect), `sync.Once` doesn't support reset — use a mutex with a flag +- **Multiple initialization phases:** If initialization has multiple stages that can partially fail and retry, `sync.Once` is too coarse (it only fires once, even on error) +- **Pre-Go 1.21 error handling:** Before Go 1.21's `sync.OnceValue`/`sync.OnceFunc`, handling errors from `Do` required workarounds. Now `sync.OnceValue` cleanly supports fallible initialization. + --- ## 12. Never Expose Mutable State Through Interfaces @@ -294,6 +678,48 @@ func (m *BaseControllerRefManager) CanAdopt(ctx context.Context) error { **The pattern K8s enforces:** Listers return objects from the read-only cache. The `DeepCopy()` pattern ensures mutation safety is the caller's responsibility, not the cache's. +### When to Apply This Rule + +**Triggers:** +- Your getter/accessor returns a pointer to a field that's part of your struct's internal state +- Multiple goroutines call the accessor, and at least one caller might modify the result +- You observe race conditions in `go test -race` pointing to internal fields accessed from outside + +**Example — detecting the smell:** +```go +type Registry struct { + mu sync.RWMutex + items map[string]*Item +} + +func (r *Registry) Get(key string) *Item { + r.mu.RLock() + defer r.mu.RUnlock() + return r.items[key] // returns internal pointer — caller can mutate! +} +// Caller does: item := registry.Get("foo"); item.Name = "bar" +// Now every other caller sees the mutated name — data race +``` + +**Example — fixed:** +```go +func (r *Registry) Get(key string) *Item { + r.mu.RLock() + defer r.mu.RUnlock() + if item, ok := r.items[key]; ok { + return item.DeepCopy() // caller gets their own copy + } + return nil +} +// Caller mutates their copy — registry is unaffected +``` + +### Exceptions + +- **Single-threaded access:** If your struct is only accessed from one goroutine (no concurrency), returning pointers to internals is safe and avoids copy overhead +- **Intentionally shared mutable state:** Some designs explicitly want callers to see mutations (e.g., shared counters, observable state). Document this clearly. +- **Performance-critical read paths:** If deep-copying on every access is too expensive and callers are trusted to not mutate, return a pointer with clear documentation: `// WARNING: do not mutate returned value` + --- ## Summary: The Philosophy