From f23623250e6b8f7c3c6d2a151d26d1a1b6012419 Mon Sep 17 00:00:00 2001
From: Aaron Weiker <aweiker@forge.weiker.org>
Date: Thu, 30 Apr 2026 05:31:32 -0700
Subject: [PATCH] docs: add when/when-not to process-design

---
 patterns/process-design.md | 1374 ++++++++++++++++++++++++++++++++++++
 1 file changed, 1374 insertions(+)

diff --git a/patterns/process-design.md b/patterns/process-design.md
index 8add9f5..6c3e41c 100644
--- a/patterns/process-design.md
+++ b/patterns/process-design.md
@@ -26,6 +26,86 @@ Analysis of `lib/elixir/lib/supervisor.ex`, `lib/elixir/lib/dynamic_supervisor.e
 #  children."
 ```
 
+### When to Use
+
+**Triggers:**
+- You know at startup exactly which children need to run (DB pool, PubSub, caches)
+- Children have ordering dependencies (pool must start before consumers)
+- You're building application-level infrastructure in your supervision tree
+
+**Example — before:**
+```elixir
+# Using DynamicSupervisor for fixed infrastructure — wrong tool
+defmodule MyApp.Application do
+  def start(_type, _args) do
+    children = [{DynamicSupervisor, name: MyApp.InfraSupervisor}]
+    Supervisor.start_link(children, strategy: :one_for_one)
+  end
+end
+
+# Manually starting fixed children after supervisor boots
+DynamicSupervisor.start_child(MyApp.InfraSupervisor, MyApp.Repo)
+DynamicSupervisor.start_child(MyApp.InfraSupervisor, MyApp.PubSub)
+DynamicSupervisor.start_child(MyApp.InfraSupervisor, MyApp.Endpoint)
+# No dependency-aware restarts (one_for_one only), and these children are lost if InfraSupervisor itself restarts!
+```
+
+**Example — after:**
+```elixir
+defmodule MyApp.Application do
+  def start(_type, _args) do
+    children = [
+      MyApp.Repo,        # DB pool starts first
+      MyApp.PubSub,      # PubSub starts after DB is ready
+      MyApp.Endpoint     # Web server starts last
+    ]
+    Supervisor.start_link(children, strategy: :rest_for_one)
+  end
+end
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- Children are created on-demand (per-connection, per-request, per-user)
+- The number of children is unbounded or varies significantly at runtime
+- You don't need ordering guarantees between children
+
+**Over-application example:**
+```elixir
+# Static supervisor for per-WebSocket connections — will be clunky
+defmodule MyApp.ConnectionSupervisor do
+  use Supervisor
+
+  def init(_) do
+    # Can't list the children here — connections arrive later, at runtime!
+    Supervisor.init([], strategy: :one_for_one)
+  end
+
+  # Awkward: using Supervisor for dynamic children
+  def add_connection(socket) do
+    spec = {ConnectionHandler, socket}
+    Supervisor.start_child(__MODULE__, spec)
+  end
+end
+```
+
+**Better alternative:**
+```elixir
+defmodule MyApp.ConnectionSupervisor do
+  use DynamicSupervisor
+
+  def start_link(_), do: DynamicSupervisor.start_link(__MODULE__, [], name: __MODULE__)
+  def init(_), do: DynamicSupervisor.init(strategy: :one_for_one)
+
+  def add_connection(socket) do
+    DynamicSupervisor.start_child(__MODULE__, {ConnectionHandler, socket})
+  end
+end
+```
+
+**Why:** Static supervisors excel at ordered, fixed infrastructure. DynamicSupervisor excels at scale with runtime-determined children. Pick based on whether the children are known when the supervisor starts.
+
 ---
 
 ## Pattern 2: PartitionSupervisor for Scalability
@@ -54,6 +134,74 @@ DynamicSupervisor.start_child(
 )
 ```
 
+### When to Use
+
+**Triggers:**
+- A single DynamicSupervisor or Task.Supervisor is a bottleneck under high `start_child` load
+- You're seeing latency spikes when spawning tasks/children under concurrency
+- Profiling shows the supervisor process has a large message queue
+
+**Example — before:**
+```elixir
+# Single supervisor — serializes all spawn operations
+defmodule MyApp.TaskRunner do
+  def run_async(fun) do
+    Task.Supervisor.async_nolink(MyApp.TaskSupervisor, fun)
+  end
+end
+
+# Under 10k concurrent requests, this single process becomes a bottleneck
+```
+
+**Example — after:**
+```elixir
+# Partitioned — distributes load across N supervisor processes
+defmodule MyApp.Application do
+  def start(_type, _args) do
+    children = [
+      {PartitionSupervisor,
+       child_spec: Task.Supervisor,
+       name: MyApp.TaskSupervisors}
+    ]
+    Supervisor.start_link(children, strategy: :one_for_one)
+  end
+end
+
+defmodule MyApp.TaskRunner do
+  def run_async(fun) do
+    Task.Supervisor.async_nolink(
+      {:via, PartitionSupervisor, {MyApp.TaskSupervisors, self()}},
+      fun
+    )
+  end
+end
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- You have low spawn rates (< 1000/sec) — a single supervisor is fine
+- You need ordering guarantees between children (partitioning breaks ordering)
+- The supervisor has few children total (partitioning adds overhead for no gain)
+
+**Over-application example:**
+```elixir
+# Partitioning a supervisor that starts 5 children at boot — pointless
+{PartitionSupervisor,
+ child_spec: DynamicSupervisor,
+ name: MyApp.ConfigSupervisors,
+ partitions: System.schedulers_online()}
+# e.g. 16 partitions on a 16-core machine for 5 children = extra processes and routing, zero benefit
+```
+
+**Better alternative:**
+```elixir
+# Just use a plain DynamicSupervisor
+{DynamicSupervisor, name: MyApp.ConfigSupervisor}
+```
+
+**Why:** PartitionSupervisor exists for high-throughput spawn scenarios. If you're not hitting supervisor mailbox limits, the extra processes and routing logic add complexity without benefit.
+
 ---
 
 ## Pattern 3: Supervision Strategies — Choosing the Right Restart Behavior
@@ -81,6 +229,92 @@ Supervisor.start_link(children, strategy: :one_for_all)
 Supervisor.start_link(children, strategy: :rest_for_one)
 ```
 
+### When to Use
+
+**Triggers:**
+- You're deciding how a supervisor should react when one child fails
+- Children share state or resources that become inconsistent if one crashes
+- You have a pipeline: A feeds B feeds C
+
+**Example — before:**
+```elixir
+# Defaulting to :one_for_one without thinking about dependencies
+defmodule MyApp.DataPipeline do
+  use Supervisor
+
+  def init(_) do
+    children = [
+      MyApp.DataSource,    # Produces data
+      MyApp.Transformer,   # Transforms data (holds reference to DataSource)
+      MyApp.Sink           # Writes transformed data
+    ]
+    # If DataSource crashes, Transformer has a stale reference!
+    Supervisor.init(children, strategy: :one_for_one)
+  end
+end
+```
+
+**Example — after:**
+```elixir
+defmodule MyApp.DataPipeline do
+  use Supervisor
+
+  def init(_) do
+    children = [
+      MyApp.DataSource,    # If this crashes...
+      MyApp.Transformer,   # ...these must restart too (stale refs)
+      MyApp.Sink
+    ]
+    # rest_for_one: crash of DataSource restarts Transformer and Sink
+    Supervisor.init(children, strategy: :rest_for_one)
+  end
+end
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- Children are truly independent (HTTP request handlers, job workers)
+- You're using `:one_for_all` because you're unsure — analyze dependencies first
+- The restart strategy masks a design problem (maybe use separate supervision subtrees)
+
+**Over-application example:**
+```elixir
+# one_for_all when children are actually independent
+defmodule MyApp.Workers do
+  use Supervisor
+
+  def init(_) do
+    children = [
+      {MyApp.EmailWorker, []},
+      {MyApp.SMSWorker, []},
+      {MyApp.PushWorker, []}
+    ]
+    # If email crashes, why restart SMS and Push? They're independent!
+    Supervisor.init(children, strategy: :one_for_all)
+  end
+end
+```
+
+**Better alternative:**
+```elixir
+defmodule MyApp.Workers do
+  use Supervisor
+
+  def init(_) do
+    children = [
+      {MyApp.EmailWorker, []},
+      {MyApp.SMSWorker, []},
+      {MyApp.PushWorker, []}
+    ]
+    # Independent workers — one crash doesn't affect others
+    Supervisor.init(children, strategy: :one_for_one)
+  end
+end
+```
+
+**Why:** Restart strategies model dependency graphs. Using `:one_for_all` for independent workers causes unnecessary restarts, losing in-progress work for no benefit.
+
 ---
 
 ## Pattern 4: Restart Intensity (`max_restarts` / `max_seconds`)
@@ -114,6 +348,76 @@ defp add_restart(restarts, now, period) do
 end
 ```
 
+### When to Use
+
+**Triggers:**
+- You want to prevent infinite restart loops from burning CPU
+- You're tuning a supervisor for a child that occasionally crashes under load
+- You need the supervisor to escalate when a systemic problem prevents recovery
+
+**Example — before:**
+```elixir
+# Default: 3 restarts in 5 seconds — might be too aggressive for flaky networks
+defmodule MyApp.ExternalAPISupervisor do
+  use Supervisor
+
+  def init(_) do
+    children = [{MyApp.APIClient, []}]
+    # Default max_restarts: 3, max_seconds: 5
+    # Network blip causes a 4th crash within 5 seconds → limit exceeded → supervisor dies → app crashes
+    Supervisor.init(children, strategy: :one_for_one)
+  end
+end
+```
+
+**Example — after:**
+```elixir
+defmodule MyApp.ExternalAPISupervisor do
+  use Supervisor
+
+  def init(_) do
+    children = [{MyApp.APIClient, []}]
+    # Allow more restarts for transient network issues
+    Supervisor.init(children,
+      strategy: :one_for_one,
+      max_restarts: 10,
+      max_seconds: 60
+    )
+  end
+end
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- You're setting max_restarts very high to "prevent crashes" — you're hiding bugs
+- The child crash is deterministic (same input = same crash) — fix the bug instead
+- You're relying on restart intensity as a backoff mechanism (use explicit backoff)
+
+**Over-application example:**
+```elixir
+# Setting absurdly high restart limits to "never crash"
+Supervisor.init(children,
+  strategy: :one_for_one,
+  max_restarts: 1000,
+  max_seconds: 1
+)
+# This allows 1000 crashes per second — you'll burn CPU and hide bugs
+```
+
+**Better alternative:**
+```elixir
+# Reasonable limits + fix the underlying crash
+Supervisor.init(children,
+  strategy: :one_for_one,
+  max_restarts: 5,
+  max_seconds: 30
+)
+# If 5 crashes in 30 seconds isn't enough, the problem is the child, not the limit
+```
+
+**Why:** Restart intensity is a circuit breaker, not a throttle. It should escalate systemic failures, not suppress them. If you need aggressive restarts, your child has a bug.
+
 ---
 
 ## Pattern 5: Restart Values — `:permanent` vs `:transient` vs `:temporary`
@@ -145,6 +449,75 @@ end
 use GenServer, restart: :transient
 ```
 
+### When to Use
+
+**Triggers:**
+- You have different process types with different lifecycle expectations
+- One-shot tasks keep restarting and wasting resources
+- A connection should gracefully disconnect without triggering restart
+
+**Example — before:**
+```elixir
+# One-shot job left at the GenServer default (:permanent) — it keeps restarting
+defmodule MyApp.BatchJob do
+  use GenServer
+
+  def init(batch), do: {:ok, batch, {:continue, :process}}
+
+  def handle_continue(:process, batch) do
+    process_batch(batch)
+    # Exits :normal... and a :permanent child gets restarted by its supervisor!
+    {:stop, :normal, batch}
+  end
+end
+```
+
+**Example — after:**
+```elixir
+defmodule MyApp.BatchTask do
+  use Task, restart: :temporary  # Task's default, stated explicitly: never restart
+
+  def start_link(batch) do
+    Task.start_link(__MODULE__, :run, [batch])
+  end
+
+  def run(batch), do: process_batch(batch)
+end
+
+defmodule MyApp.ConnectionWorker do
+  use GenServer, restart: :transient  # Restart on crash, not graceful disconnect
+
+  def disconnect(pid) do
+    GenServer.stop(pid, :normal)  # Won't trigger restart
+  end
+end
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- You're using `:temporary` to avoid fixing a crash (the child just stays dead)
+- You set everything to `:transient` without thinking — `:permanent` is usually right for services
+
+**Over-application example:**
+```elixir
+# Making a critical service :temporary so it "doesn't bother the supervisor"
+defmodule MyApp.PaymentProcessor do
+  use GenServer, restart: :temporary
+  # If this crashes, it stays dead! Payments stop working silently!
+end
+```
+
+**Better alternative:**
+```elixir
+defmodule MyApp.PaymentProcessor do
+  use GenServer, restart: :permanent
+  # Critical services should always restart — that's the whole point
+end
+```
+
+**Why:** `:permanent` is the safe default for anything that should "always be running." Only use `:transient` for processes that have a valid "done" state, and `:temporary` for truly one-shot work.
+
 ---
 
 ## Pattern 6: Automatic Shutdown for Pipeline Supervisors
@@ -170,6 +543,93 @@ Supervisor.start_link(children,
 )
 ```
 
+### When to Use
+
+**Triggers:**
+- You have a supervisor managing a batch/pipeline where completion means "job done"
+- A supervisor's existence only makes sense while its children are doing work
+- You're building a workflow that should self-terminate
+
+**Example — before:**
+```elixir
+# Manual cleanup when batch workers finish
+defmodule MyApp.BatchSupervisor do
+  use DynamicSupervisor
+
+  def all_done?(supervisor) do
+    # Polling... ugly
+    DynamicSupervisor.count_children(supervisor).active == 0
+  end
+end
+
+# Somewhere else, a monitor process watches and cleans up
+defmodule MyApp.BatchMonitor do
+  use GenServer
+
+  def handle_info(:check, state) do
+    if MyApp.BatchSupervisor.all_done?(state.sup) do
+      Supervisor.stop(state.sup)
+    end
+    {:noreply, state}
+  end
+end
+```
+
+**Example — after:**
+```elixir
+defmodule MyApp.BatchSupervisor do
+  use Supervisor
+
+  def start_link(tasks) do
+    Supervisor.start_link(__MODULE__, tasks)
+  end
+
+  def init(tasks) do
+    children =
+      Enum.map(tasks, fn task ->
+        Supervisor.child_spec({MyApp.BatchWorker, task},
+          id: task.id, restart: :transient, significant: true)
+      end)
+
+    Supervisor.init(children,
+      strategy: :one_for_one,
+      auto_shutdown: :all_significant
+    )
+  end
+end
+# When all workers complete normally, supervisor shuts down automatically
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- Children are long-lived services that should never "complete"
+- You want the supervisor to keep running even after children exit (for later restarts)
+- Children have `:permanent` restart — they can't be marked `significant: true`
+
+**Over-application example:**
+```elixir
+# auto_shutdown on infrastructure supervisor — it'll die when any significant child exits!
+children = [
+  Supervisor.child_spec(MyApp.Cache, significant: true, restart: :transient),
+  MyApp.WebServer
+]
+Supervisor.init(children,
+  strategy: :one_for_one,
+  auto_shutdown: :any_significant
+)
+# Cache is :transient and significant: if it exits :normal even once, the WHOLE supervisor shuts down
+```
+
+**Better alternative:**
+```elixir
+# Infrastructure supervisors should NOT auto-shutdown (and their child specs must not set `significant: true`)
+Supervisor.init(children, strategy: :one_for_one)
+# Only use auto_shutdown for workflow/batch supervisors with finite lifetimes
+```
+
+**Why:** `auto_shutdown` models "this supervisor's job is done when its children finish." It's for finite work, not long-lived services.
+
 ---
 
 ## Pattern 7: Task.async/await for Concurrent Value Computation
@@ -191,6 +651,69 @@ res + Task.await(task)
 
 Key constraint from docs: "If you start an async, you **must await**. This is either done by calling `Task.await/2` or `Task.yield/2` followed by `Task.shutdown/2`."
 
+### When to Use
+
+**Triggers:**
+- You need to compute a value concurrently and use it in the current flow
+- Multiple independent computations can run in parallel to reduce latency
+- The current process should crash if the computation fails (linked failure)
+
+**Example — before:**
+```elixir
+# Sequential — total time = sum of all operations
+def build_dashboard(user_id) do
+  profile = fetch_profile(user_id)        # 200ms
+  orders = fetch_recent_orders(user_id)   # 300ms
+  recommendations = compute_recs(user_id) # 500ms
+  # Total: 1000ms
+
+  %{profile: profile, orders: orders, recommendations: recommendations}
+end
+```
+
+**Example — after:**
+```elixir
+# Concurrent — total time = max of all operations
+def build_dashboard(user_id) do
+  profile_task = Task.async(fn -> fetch_profile(user_id) end)
+  orders_task = Task.async(fn -> fetch_recent_orders(user_id) end)
+  recs_task = Task.async(fn -> compute_recs(user_id) end)
+
+  %{
+    profile: Task.await(profile_task),
+    orders: Task.await(orders_task),
+    recommendations: Task.await(recs_task)
+  }
+  # Total: ~500ms (limited by slowest task)
+end
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- The task might fail and you don't want the caller to crash (use `async_nolink`)
+- You're inside a GenServer and can't block on `await` (use `async_nolink` + `handle_info`)
+- The computation is trivial (< 1ms) — spawning a process adds overhead
+
+**Over-application example:**
+```elixir
+# Spawning a task for trivial work — overhead exceeds benefit
+def format_name(user) do
+  task = Task.async(fn -> String.upcase(user.name) end)
+  Task.await(task)
+  # Process spawn + message passing for a microsecond operation!
+end
+```
+
+**Better alternative:**
+```elixir
+def format_name(user) do
+  String.upcase(user.name)
+end
+```
+
+**Why:** Tasks add process spawn overhead (~2-5μs) plus message passing. Only use them when the work is expensive enough to justify parallelism — typically >1ms or when running multiple operations concurrently.
+
 ---
 
 ## Pattern 8: Task.Supervisor.async_nolink for Fault-Tolerant Task Execution
@@ -230,6 +753,93 @@ defmodule MyApp.Server do
 end
 ```
 
+### When to Use
+
+**Triggers:**
+- A GenServer needs to spawn work that might fail without crashing the server
+- You're building a "request/response with timeout" pattern inside a GenServer
+- External calls (HTTP, DB) from a GenServer should be non-blocking and resilient
+
+**Example — before:**
+```elixir
+defmodule MyApp.Enricher do
+  use GenServer
+
+  @impl true
+  def handle_call({:enrich, data}, _from, state) do
+    # If this HTTP call crashes, the entire GenServer dies!
+    result = Task.async(fn -> HTTPClient.post!("/api/enrich", data) end)
+    enriched = Task.await(result, 5_000)
+    {:reply, {:ok, enriched}, state}
+  end
+end
+```
+
+**Example — after:**
+```elixir
+defmodule MyApp.Enricher do
+  use GenServer
+
+  @impl true
+  def handle_call({:enrich, data}, from, state) do
+    task = Task.Supervisor.async_nolink(MyApp.TaskSupervisor, fn ->
+      HTTPClient.post!("/api/enrich", data)
+    end)
+    {:noreply, Map.put(state, task.ref, from)}
+  end
+
+  @impl true
+  def handle_info({ref, result}, state) do
+    Process.demonitor(ref, [:flush])
+    {from, state} = Map.pop(state, ref)
+    GenServer.reply(from, {:ok, result})
+    {:noreply, state}
+  end
+
+  @impl true
+  def handle_info({:DOWN, ref, :process, _pid, reason}, state) do
+    {from, state} = Map.pop(state, ref)
+    GenServer.reply(from, {:error, reason})
+    {:noreply, state}
+  end
+end
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- Task failure should crash the caller (you WANT linked failure propagation)
+- You're in a short-lived caller where crashing together with the task is acceptable (a linked task's crash cannot be caught with try/rescue)
+- The work is fast and synchronous is acceptable
+
+**Over-application example:**
+```elixir
+# Using async_nolink for work that SHOULD crash the caller on failure
+defmodule MyApp.CriticalPayment do
+  use GenServer
+
+  def handle_call({:charge, card}, _from, state) do
+    task = Task.Supervisor.async_nolink(MyApp.TaskSupervisor, fn ->
+      PaymentGateway.charge!(card)
+    end)
+    # Now you have to manually handle the failure...
+    # But if payment fails, maybe this GenServer SHOULD crash
+    # to trigger a supervisor restart with clean state
+  end
+end
+```
+
+**Better alternative:**
+```elixir
+# If failure should crash the GenServer, use Task.async (linked)
+def handle_call({:charge, card}, _from, state) do
+  result = Task.async(fn -> PaymentGateway.charge!(card) end)
+  {:reply, Task.await(result), state}
+end
+```
+
+**Why:** `async_nolink` is for resilient, non-critical work. If the task's failure means your GenServer's state is invalid, you want the link — let it crash and restart clean.
+
 ---
 
 ## Pattern 9: Task Supervisor as DynamicSupervisor Specialization
@@ -260,6 +870,68 @@ def init({{_restart, _shutdown} = arg, options}) do
 end
 ```
 
+### When to Use
+
+**Triggers:**
+- You're building task infrastructure that needs proper shutdown and caller tracking
+- You want `async_nolink` + streaming + concurrency limiting for tasks
+- You need tasks to be supervised (tracked, shut down gracefully, and — via `start_child` with `:restart` — optionally restarted)
+
+**Example — before:**
+```elixir
+# Rolling your own task management on DynamicSupervisor
+defmodule MyApp.Workers do
+  def start_task(fun) do
+    spec = %{id: make_ref(), start: {Task, :start_link, [fun]}, restart: :temporary}
+    DynamicSupervisor.start_child(MyApp.WorkerSup, spec)
+  end
+
+  # No caller tracking, no async_nolink, no stream support
+  # Must manually build all of that
+end
+```
+
+**Example — after:**
+```elixir
+# Task.Supervisor gives you all of this for free
+defmodule MyApp.Application do
+  def start(_type, _args) do
+    children = [
+      {Task.Supervisor, name: MyApp.TaskSupervisor}
+    ]
+    Supervisor.start_link(children, strategy: :one_for_one)
+  end
+end
+
+# Now you get: async, async_nolink, async_stream, start_child, etc.
+Task.Supervisor.async_nolink(MyApp.TaskSupervisor, fn -> work() end)
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- Tasks are fire-and-forget and you don't need supervision (just `Task.start/1`)
+- You need custom child specs with complex init logic (use DynamicSupervisor directly)
+- You're spawning non-Task children (GenServers, Agents)
+
+**Over-application example:**
+```elixir
+# Using Task.Supervisor to start GenServers — wrong tool
+Task.Supervisor.start_child(MyApp.TaskSupervisor, fn ->
+  # This spawns a Task that starts a GenServer... awkward
+  {:ok, _} = MyApp.Worker.start_link(args)
+  Process.sleep(:infinity)  # Keep the task alive??
+end)
+```
+
+**Better alternative:**
+```elixir
+# Use DynamicSupervisor for non-Task children
+DynamicSupervisor.start_child(MyApp.WorkerSupervisor, {MyApp.Worker, args})
+```
+
+**Why:** Task.Supervisor is purpose-built for Task processes. It adds caller tracking, `$callers` propagation, and task-specific APIs. For anything that isn't a Task, use DynamicSupervisor.
+
 ---
 
 ## Pattern 10: Registry for Dynamic Process Naming and PubSub
@@ -296,6 +968,87 @@ defp whereis_name(registry, key) do
 end
 ```
 
+### When to Use
+
+**Triggers:**
+- You need to look up processes by a dynamic key without atom leaks
+- You want local PubSub (subscribe/dispatch to topics) without external deps
+- You're building per-entity process pools (per-user, per-room, per-device)
+
+**Example — before:**
+```elixir
+# Custom ETS-based registry with manual cleanup
+defmodule MyApp.ProcessRegistry do
+  def register(key, pid) do
+    ref = Process.monitor(pid)
+    :ets.insert(:registry, {key, pid, ref})
+  end
+
+  def lookup(key) do
+    case :ets.lookup(:registry, key) do
+      [{^key, pid, _ref}] -> {:ok, pid}
+      [] -> :error
+    end
+  end
+
+  # Must handle :DOWN manually to clean up dead entries
+  def handle_info({:DOWN, ref, :process, pid, _reason}, state) do
+    :ets.match_delete(:registry, {:_, pid, ref})
+    {:noreply, state}
+  end
+end
+```
+
+**Example — after:**
+```elixir
+# Registry handles all of this automatically
+# In supervision tree:
+{Registry, keys: :unique, name: MyApp.GameRegistry}
+
+# Registration happens via :via tuple — automatic cleanup on death
+defmodule MyApp.GameSession do
+  use GenServer
+
+  def start_link(game_id) do
+    GenServer.start_link(__MODULE__, game_id,
+      name: {:via, Registry, {MyApp.GameRegistry, game_id}})
+  end
+end
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- You need distributed/cluster-wide process registration (use Horde, `:global`, or `:pg`)
+- Process lookup is the hot path and you need sub-microsecond latency (direct PID passing)
+- You have a fixed set of processes known at compile time (atom names are simpler)
+
+**Over-application example:**
+```elixir
+# Using Registry when you could just pass the PID directly
+defmodule MyApp.Pipeline do
+  def process(data) do
+    # Register a process just to look it up one line later...
+    {:ok, pid} = MyApp.Worker.start_link(data)
+    # Why not just use `pid` directly?
+    worker = Registry.lookup(MyApp.Registry, data.id) |> List.first()
+    GenServer.call(elem(worker, 0), :process)
+  end
+end
+```
+
+**Better alternative:**
+```elixir
+defmodule MyApp.Pipeline do
+  def process(data) do
+    {:ok, pid} = MyApp.Worker.start_link(data)
+    GenServer.call(pid, :process)
+  end
+end
+```
+
+**Why:** Registry shines when the looker-upper doesn't know the PID (arrived in a different request, different process tree). If you already have the PID, just use it directly.
+
 ---
 
 ## Pattern 11: Shutdown Semantics — Graceful Termination
@@ -327,6 +1080,77 @@ Workers default to 5000ms. Supervisors default to `:infinity` (to give their chi
 %{shutdown: :infinity, type: :supervisor}
 ```
 
+### When to Use
+
+**Triggers:**
+- You're deploying and need processes to flush buffers, close connections, or deregister
+- Child processes hold external resources that leak if killed immediately
+- Your system has a clean shutdown requirement (compliance, data integrity)
+
+**Example — before:**
+```elixir
+# :brutal_kill on a process that writes to disk — data loss
+defmodule MyApp.WriteAheadLog do
+  use GenServer, shutdown: :brutal_kill  # BAD: loses buffered writes
+
+  @impl true
+  def terminate(_reason, state) do
+    # This never runs with :brutal_kill!
+    flush_buffer_to_disk(state.buffer)
+  end
+end
+```
+
+**Example — after:**
+```elixir
+defmodule MyApp.WriteAheadLog do
+  use GenServer, shutdown: 10_000  # 10 seconds to flush (terminate/2 runs on shutdown only if init/1 sets Process.flag(:trap_exit, true))
+
+  @impl true
+  def terminate(_reason, state) do
+    flush_buffer_to_disk(state.buffer)
+    close_file_handle(state.fd)
+  end
+end
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- You're setting `:infinity` on worker processes — a bug in `terminate/2` hangs your entire shutdown
+- The process holds no external resources (default 5000ms is fine)
+- You're using `:brutal_kill` on supervisors (they need time to stop their children)
+
+**Over-application example:**
+```elixir
+# :infinity shutdown on a worker — if terminate hangs, deployment hangs
+defmodule MyApp.Worker do
+  use GenServer, shutdown: :infinity
+
+  @impl true
+  def terminate(_reason, state) do
+    # If this HTTP call hangs forever, your entire app can't shut down
+    HTTPClient.post!("/api/deregister", %{id: state.id})
+  end
+end
+```
+
+**Better alternative:**
+```elixir
+defmodule MyApp.Worker do
+  use GenServer, shutdown: 15_000  # Generous but bounded
+
+  @impl true
+  def terminate(_reason, state) do
+    # Use a timeout on the cleanup call too
+    task = Task.async(fn -> HTTPClient.post("/api/deregister", %{id: state.id}) end)
+    Task.yield(task, 10_000) || Task.shutdown(task, :brutal_kill)
+  end
+end
+```
+
+**Why:** `:infinity` is safe for supervisors (they're waiting for children) but dangerous for workers. A hung `terminate/2` with infinite shutdown blocks your entire deployment pipeline.
+
 ---
 
 ## Pattern 12: DynamicSupervisor Internal State — Struct with Restart Tracking
@@ -355,6 +1179,78 @@ defstruct [
 ]
 ```
 
+### When to Use
+
+**Triggers:**
+- You need to understand the internal implementation of a supervisor for debugging
+- You're building a custom supervisor-like process
+- You want to understand why DynamicSupervisor uses a map keyed by PID
+
+**Example — before:**
+```elixir
+# Using a list to track children — O(n) on every EXIT message
+defmodule MyApp.CustomSupervisor do
+  use GenServer
+
+  @impl true
+  def init(_) do
+    {:ok, %{children: []}}  # List! Every EXIT scans the whole thing
+  end
+
+  def handle_info({:EXIT, pid, _reason}, state) do
+    # O(n) scan to find and remove the dead child
+    children = Enum.reject(state.children, fn {p, _spec} -> p == pid end)
+    {:noreply, %{state | children: children}}
+  end
+end
+```
+
+**Example — after:**
+```elixir
+defmodule MyApp.CustomSupervisor do
+  use GenServer
+
+  defstruct children: %{}, restarts: [], max_restarts: 3, max_seconds: 5
+
+  @impl true
+  def init(_) do
+    Process.flag(:trap_exit, true)
+    {:ok, %__MODULE__{}}
+  end
+
+  def handle_info({:EXIT, pid, _reason}, state) do
+    # O(1) lookup and delete
+    {_spec, children} = Map.pop(state.children, pid)
+    {:noreply, %{state | children: children}}
+  end
+end
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- You're building a production supervisor (use Supervisor/DynamicSupervisor)
+- You don't need custom supervision logic (the standard supervisors cover 99% of cases)
+- You're optimizing prematurely — most apps never have enough children for data structure choice to matter
+
+**Over-application example:**
+```elixir
+# Building a custom supervisor because "I want more control"
+defmodule MyApp.FancySupervisor do
+  use GenServer
+  # 200 lines of restart logic, child tracking, shutdown handling...
+  # Congratulations, you've reimplemented DynamicSupervisor with more bugs
+end
+```
+
+**Better alternative:**
+```elixir
+# Just use the standard one
+{DynamicSupervisor, name: MyApp.FancySupervisor, strategy: :one_for_one}
+```
+
+**Why:** The standard supervisors are battle-tested over decades. Build custom only when you need semantics they don't provide (e.g., priority-based restart, custom backoff).
+
 ---
 
 ## Pattern 13: Restart Logic with Exponential Backoff via `:try_again`
@@ -393,6 +1289,95 @@ defp restart_child(:one_for_one, current_pid, child, state) do
 end
 ```
 
+### When to Use
+
+**Triggers:**
+- A child fails to start due to transient conditions (port conflict, network partition)
+- You're seeing restart intensity limits hit because start failures count as restarts
+- You need a supervisor that tolerates temporary resource unavailability
+
+**Example — before:**
+```elixir
+# Every start failure counts against restart intensity
+# More than 3 failures in 5 seconds → supervisor crashes → cascading failure
+defmodule MyApp.ConnectionPool do
+  use GenServer
+
+  @impl true
+  def init(config) do
+    # If DB is temporarily unreachable, this crashes...
+    # ...which counts as a restart...
+    # ...which can exhaust restart intensity
+    {:ok, conn} = DBConnection.start_link(config)
+    {:ok, %{conn: conn}}
+  end
+end
+```
+
+**Example — after:**
+```elixir
+defmodule MyApp.ConnectionPool do
+  use GenServer
+
+  @impl true
+  def init(config) do
+    # Use handle_continue for the connection attempt
+    {:ok, %{config: config, conn: nil}, {:continue, :connect}}
+  end
+
+  @impl true
+  def handle_continue(:connect, state) do
+    case DBConnection.start_link(state.config) do
+      {:ok, conn} ->
+        {:noreply, %{state | conn: conn}}
+      {:error, _reason} ->
+        # Retry after delay without counting against restart intensity
+        Process.send_after(self(), :retry_connect, 5_000)
+        {:noreply, state}
+    end
+  end
+
+  @impl true
+  def handle_info(:retry_connect, state) do
+    {:noreply, state, {:continue, :connect}}
+  end
+end
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- The start failure is deterministic (config error, missing module) — fix the bug
+- You're relying on automatic retry to avoid proper health checking
+- The child should NOT start at all if dependencies are unavailable (use `:ignore`)
+
+**Over-application example:**
+```elixir
+# Retrying forever when the error is permanent
+defmodule MyApp.MisconfiguredWorker do
+  @impl true
+  def init(%{api_key: nil}) do
+    # This will NEVER succeed — the key is nil!
+    # Infinite retry just wastes resources
+    Process.send_after(self(), :retry, 5_000)
+    {:ok, %{}}
+  end
+end
+```
+
+**Better alternative:**
+```elixir
+defmodule MyApp.MisconfiguredWorker do
+  @impl true
+  def init(%{api_key: nil}) do
+    # Fail fast on permanent configuration errors
+    {:stop, {:error, :missing_api_key}}
+  end
+end
+```
+
+**Why:** Retry logic is for transient failures (network, resource contention). For permanent errors (bad config, missing deps), fail fast so the operator can fix the actual problem.
+
 ---
 
 ## Pattern 14: `$ancestors` and `$callers` — Process Lineage Tracking
@@ -426,6 +1411,86 @@ def start_link(module, function, args)
 end
 ```
 
+### When to Use
+
+**Triggers:**
+- You're debugging crashes and need to understand where a task was spawned from
+- You're building custom process spawning and want crash logs to show the call chain
+- You need to trace a request through multiple spawned processes
+
+**Example — before:**
+```elixir
+# Custom spawner that loses caller context
+defmodule MyApp.BackgroundJob do
+  def run_async(fun) do
+    spawn_link(fn ->
+      # When this crashes, the log shows no context about WHO spawned it
+      fun.()
+    end)
+  end
+end
+
+# Crash log:
+# [error] Process #PID<0.234.0> raised an exception
+# ** (RuntimeError) something went wrong
+# No idea who called run_async or why!
+```
+
+**Example — after:**
+```elixir
+defmodule MyApp.BackgroundJob do
+  def run_async(fun) do
+    owner = self()
+    callers = case Process.get(:"$callers") do
+      [_ | _] = list -> [owner | list]
+      _ -> [owner]
+    end
+
+    spawn_link(fn ->
+      Process.put(:"$callers", callers)
+      fun.()
+    end)
+  end
+end
+
+# Crash log now shows the full caller chain:
+# [error] Process #PID<0.234.0> raised an exception
+# Callers: [#PID<0.200.0>, #PID<0.150.0>]  ← who initiated this work
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- You're using Task/Task.Supervisor (they propagate callers automatically)
+- The process is long-lived and the original caller is irrelevant after startup
+- You're spawning processes that outlive their callers (callers list becomes stale)
+
+**Over-application example:**
+```elixir
+# Tracking callers for a permanent GenServer — pointless after init
+defmodule MyApp.Cache do
+  use GenServer
+
+  def start_link(_) do
+    # The "caller" of start_link is the supervisor — not useful for debugging
+    # After boot, the cache serves many callers — the original spawner is irrelevant
+    GenServer.start_link(__MODULE__, [], name: __MODULE__)
+  end
+end
+```
+
+**Better alternative:**
+```elixir
+# For long-lived processes, track per-request context (Logger.metadata, OpenTelemetry spans).
+# Logger metadata is per-process, so the caller must send its request_id in the message
+def handle_call({:get, key, request_id}, _from, state) do
+  Logger.metadata(request_id: request_id)
+  {:reply, Map.get(state, key), state}
+end
+```
+
+**Why:** `$callers` is useful for short-lived spawned work (tasks, one-shot processes). For long-lived services, per-request tracing (metadata, spans) is more appropriate than process lineage.
+
 ---
 
 ## Pattern 15: GenServer.reply/2 for Deferred Responses
@@ -456,6 +1521,87 @@ def handle_info({:reply, from}, state) do
 end
 ```
 
+### When to Use
+
+**Triggers:**
+- A GenServer needs to do async work before replying (DB query, HTTP call, aggregation)
+- You want to reply from a different process than the one that received the request
+- You need to send intermediate progress (as plain messages) and then a single final response via `reply/2`
+
+**Example — before:**
+```elixir
+defmodule MyApp.Aggregator do
+  use GenServer
+
+  @impl true
+  def handle_call(:aggregate, _from, state) do
+    # Blocks the GenServer for potentially seconds
+    # No other calls can be processed during this time
+    result = Enum.reduce(state.sources, %{}, fn source, acc ->
+      data = HTTPClient.get!(source.url).body
+      Map.merge(acc, Jason.decode!(data))
+    end)
+    {:reply, result, state}
+  end
+end
+```
+
+**Example — after:**
+```elixir
+defmodule MyApp.Aggregator do
+  use GenServer
+
+  @impl true
+  def handle_call(:aggregate, from, state) do
+    # Don't block: reply later. start_child (not async_nolink) sends no task messages here
+    Task.Supervisor.start_child(MyApp.TaskSupervisor, fn ->
+      result = Enum.reduce(state.sources, %{}, fn source, acc ->
+        data = HTTPClient.get!(source.url).body
+        Map.merge(acc, Jason.decode!(data))
+      end)
+      GenServer.reply(from, result)
+    end)
+    {:noreply, state}
+  end
+end
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- The work is fast (< 1ms) — just reply inline
+- You need the reply to be ordered with respect to other calls (deferred replies break ordering)
+- The `from` tuple escapes to a long-lived process that may never reply (the caller stays blocked until its `call` timeout, then exits)
+
+**Over-application example:**
+```elixir
+# Deferring reply for trivial work — unnecessary complexity
+defmodule MyApp.Counter do
+  use GenServer
+
+  @impl true
+  def handle_call(:get, from, state) do
+    # This is instant! Why defer?
+    Task.start(fn -> GenServer.reply(from, state.count) end)
+    {:noreply, state}
+  end
+end
+```
+
+**Better alternative:**
+```elixir
+defmodule MyApp.Counter do
+  use GenServer
+
+  @impl true
+  def handle_call(:get, _from, state) do
+    {:reply, state.count, state}
+  end
+end
+```
+
+**Why:** `reply/2` enables non-blocking GenServers for expensive operations. For cheap operations, it adds process spawn overhead, potential ordering issues, and code complexity for no benefit.
+
 ---
 
 ## Pattern 16: Process.alias for Safe Request/Response
@@ -483,6 +1629,87 @@ end
 Process.unalias(source_alias)
 ```
 
+### When to Use
+
+**Triggers:**
+- You're building request/response patterns with timeouts where late replies pollute the mailbox
+- A GenServer sends a request and moves on after timeout, but the response arrives later
+- You need safe cancellation of pending responses
+
+**Example — before:**
+```elixir
+defmodule MyApp.RequestRouter do
+  use GenServer
+
+  @impl true
+  def handle_call({:request, payload}, _from, state) do
+    send(state.backend, {:request, self(), payload})
+    receive do
+      {:response, result} -> {:reply, result, state}
+    after
+      5_000 ->
+        # Timeout... but the response might still arrive later!
+        # It'll sit in our mailbox and confuse future receives
+        {:reply, {:error, :timeout}, state}
+    end
+  end
+end
+```
+
+**Example — after:**
+```elixir
+defmodule MyApp.RequestRouter do
+  use GenServer
+
+  @impl true
+  def handle_call({:request, payload}, _from, state) do
+    alias_ref = Process.alias([:reply])
+    send(state.backend, {:request, alias_ref, payload})
+
+    receive do
+      {^alias_ref, result} -> {:reply, result, state}
+    after
+      5_000 ->
+        # Deactivate the alias — late replies are silently dropped
+        Process.unalias(alias_ref)
+        {:reply, {:error, :timeout}, state}
+    end
+  end
+end
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- You're using GenServer.call (it already handles this with its own ref-based protocol)
+- The response will always arrive (no timeout scenario)
+- You're on OTP < 24 (aliases aren't available; `Process.alias/1` itself needs Elixir 1.15+)
+
+**Over-application example:**
+```elixir
+# Using aliases for GenServer.call — it already handles late replies
+defmodule MyApp.Client do
+  def get_data(server) do
+    alias_ref = Process.alias([:reply])
+    # Pointless — GenServer.call already uses monitor-based protocol
+    # that handles late replies correctly
+    GenServer.call(server, {:get, alias_ref})
+  end
+end
+```
+
+**Better alternative:**
+```elixir
+defmodule MyApp.Client do
+  def get_data(server) do
+    # GenServer.call already handles timeouts and late replies correctly
+    GenServer.call(server, :get, 5_000)
+  end
+end
+```
+
+**Why:** Aliases solve the problem for custom protocols where you build your own request/response. GenServer.call already has equivalent protections built in. Use aliases when you're implementing raw message-based protocols.
+
 ---
 
 ## Pattern 17: Registry Partitioning Strategies
@@ -514,6 +1741,73 @@ Registry.start_link(
 )
 ```
 
+### When to Use
+
+**Triggers:**
+- You have a PubSub with many topics and few subscribers per topic — key lookups are slow
+- Profiling shows Registry.dispatch scanning many partitions for key-based lookups
+- You're choosing between "optimize for subscribe/unsubscribe" vs "optimize for dispatch"
+
+**Example — before:**
+```elixir
+# Default :pid partitioning with many unique keys (`partitions:` itself defaults to 1)
+# Each dispatch must scan ALL partitions to find subscribers for a key
+Registry.start_link(keys: :duplicate, name: MyApp.Events, partitions: 16)
+
+# With 16 partitions and 100k unique event types,
+# every dispatch scans 16 ETS tables
+Registry.dispatch(MyApp.Events, "order.created", fn entries ->
+  for {pid, _} <- entries, do: send(pid, :notify)
+end)
+```
+
+**Example — after:**
+```elixir
+# Key partitioning — dispatch hits exactly ONE partition per key
+Registry.start_link(
+  keys: {:duplicate, :key},
+  name: MyApp.Events,
+  partitions: System.schedulers_online()
+)
+
+# Now dispatch only scans one ETS table — O(1) partitions
+Registry.dispatch(MyApp.Events, "order.created", fn entries ->
+  for {pid, _} <- entries, do: send(pid, :notify)
+end)
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- You have few keys with many subscribers (`:pid` partitioning is better for cleanup)
+- Process death cleanup is the hot path (`:key` partitioning must scan all partitions on death)
+- You're not hitting performance issues with the default (premature optimization)
+
+**Over-application example:**
+```elixir
+# Key partitioning for a "presence" system where processes die frequently
+# Each death must scan ALL partitions to unregister
+Registry.start_link(
+  keys: {:duplicate, :key},
+  name: MyApp.Presence,
+  partitions: 16
+)
+# With 50k users connecting/disconnecting per second,
+# each disconnect scans 16 partitions — worse than default!
+```
+
+**Better alternative:**
+```elixir
+# Pid partitioning — death cleanup is localized to one partition
+Registry.start_link(
+  keys: :duplicate,
+  name: MyApp.Presence,
+  partitions: System.schedulers_online()
+)
+```
+
+**Why:** Partitioning is a tradeoff. `:key` optimizes dispatch (one partition per lookup) at the cost of death cleanup (scan all). `:pid` optimizes death cleanup (one partition) at the cost of dispatch (scan all). Pick based on which operation is hotter.
+
 ---
 
 ## Pattern 18: `init/1` Return Values — The Full Spectrum
@@ -536,3 +1830,83 @@ Registry.start_link(
 
 **Anti-pattern:** Using `{:stop, reason}` when `:ignore` is appropriate. If a feature is disabled by config, `:ignore` keeps the child spec in the supervisor for later activation. `{:stop, reason}` signals a real failure.
 
+### When to Use
+
+**Triggers:**
+- You need to communicate "don't start this child" without the supervisor treating it as failure
+- A feature is disabled by config but the child spec should remain for hot-enabling
+- A process discovers during init that it's a duplicate and should yield to the existing one
+
+**Example — before:**
+```elixir
+defmodule MyApp.OptionalFeature do
+  use GenServer
+
+  @impl true
+  def init(_) do
+    if Application.get_env(:my_app, :feature_enabled) do
+      {:ok, %{}}
+    else
+      # {:stop, :disabled} is a start failure: it aborts supervisor boot or burns restart intensity!
+      {:stop, :disabled}
+    end
+  end
+end
+```
+
+**Example — after:**
+```elixir
+defmodule MyApp.OptionalFeature do
+  use GenServer
+
+  @impl true
+  def init(_) do
+    if Application.get_env(:my_app, :feature_enabled) do
+      {:ok, %{}}
+    else
+      # :ignore — supervisor is happy, child spec stays for later activation
+      :ignore
+    end
+  end
+end
+
+# Later, to enable:
+# Update config, then:
+# Supervisor.restart_child(MyApp.Supervisor, MyApp.OptionalFeature)
+```
+
+### When NOT to Use
+
+**Don't use this when:**
+- The failure is real and should count toward restart intensity (use `{:stop, reason}`)
+- You want the supervisor to NOT have a child spec for this module (just don't add it)
+- The process should retry starting later automatically (return `{:ok, state}` and retry from `handle_continue/2` or a timer; an `:ignore`d child is never restarted on its own)
+
+**Over-application example:**
+```elixir
+# Using :ignore for a real failure — hides the problem
+defmodule MyApp.DBConnection do
+  @impl true
+  def init(config) do
+    case connect(config) do
+      {:ok, conn} -> {:ok, conn}
+      {:error, _} -> :ignore  # BAD: DB is down but we pretend everything is fine
+    end
+  end
+end
+```
+
+**Better alternative:**
+```elixir
+defmodule MyApp.DBConnection do
+  @impl true
+  def init(config) do
+    case connect(config) do
+      {:ok, conn} -> {:ok, conn}
+      {:error, reason} -> {:stop, reason}  # Let supervisor handle the failure
+    end
+  end
+end
+```
+
+**Why:** `:ignore` means "this child intentionally should not run right now." `{:stop, reason}` means "this child tried to start and failed." Conflating the two hides real failures from your supervision tree.