// cachedHealthChecker is a health check that remembers its latest outcome so
// that frequent probe requests do not each hit the database. Per the original
// note, the real check is meant to run about once every 5 seconds.
type cachedHealthChecker struct {
	// mu presumably guards lastResult and lastCheckAt; no methods are
	// visible in this snippet to confirm the locking discipline.
	mu sync.RWMutex

	// checkFn performs the real (uncached) health check.
	checkFn func(ctx context.Context) error
	// cacheDur is how long a result is reused before checkFn runs again.
	cacheDur time.Duration

	// lastResult and lastCheckAt record the outcome and time of the most
	// recent real check.
	lastResult  error
	lastCheckAt time.Time
}

完整实现

package health

import (
	"context"
	"database/sql"
	"encoding/json"
	"net/http"
	"sync"
	"sync/atomic"
	"time"
)

// Status is the JSON document returned by the health check endpoints.
type Status struct {
	// Status is the overall state: "ok" or "degraded".
	Status string `json:"status"`

	// Components maps a component name to its individual state. It is
	// omitted from the JSON output when empty.
	Components map[string]ComponentStatus `json:"components,omitempty"`

	// Timestamp is the time at which the response was produced.
	Timestamp time.Time `json:"timestamp"`
}

// ComponentStatus describes the health of a single component inside a Status
// document.
type ComponentStatus struct {
	// Status is the component state; the handlers in this file use
	// "healthy", "unhealthy" and "not_ready".
	Status  string `json:"status"`
	// Message carries optional detail (for example an error text) and is
	// omitted from the JSON output when empty.
	Message string `json:"message,omitempty"`
}

// Checker implements the HTTP health check endpoints (liveness, readiness and
// startup) for a service backed by a SQL database.
type Checker struct {
	// db is the database handle pinged by the readiness check.
	db *sql.DB

	// readyFlag is a manual readiness switch; see SetReady.
	readyFlag atomic.Bool

	// dbCacheDur is how long a database ping result is reused before the
	// database is pinged again.
	dbCacheDur time.Duration

	// lastDBCheck holds the most recent ping outcome. mu guards result
	// and at.
	lastDBCheck struct {
		mu     sync.RWMutex
		result error
		at     time.Time
	}
}

// NewChecker returns a Checker that pings db for its readiness check and
// reuses each ping result for 5 seconds. The returned Checker starts in the
// ready state.
func NewChecker(db *sql.DB) *Checker {
	checker := new(Checker)
	checker.db = db
	checker.dbCacheDur = 5 * time.Second
	// Ready by default; callers flip this with SetReady.
	checker.readyFlag.Store(true)
	return checker
}

// SetReady manually sets the readiness flag. Call it with false during
// graceful shutdown so that ReadinessHandler starts answering 503.
func (c *Checker) SetReady(ready bool) {
	c.readyFlag.Store(ready)
}

// LivenessHandler serves the liveness probe. It only shows that the process
// can still answer HTTP requests and deliberately checks no external
// dependency: if the process is truly stuck the request times out and the
// prober treats that as a failure.
func (c *Checker) LivenessHandler(w http.ResponseWriter, r *http.Request) {
	payload := Status{
		Status:    "ok",
		Timestamp: time.Now(),
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)

	// The status line has already been sent, so an encoding or write
	// failure cannot be reported to the client; the result is discarded.
	_ = json.NewEncoder(w).Encode(payload)
}

// ReadinessHandler serves the readiness probe: it reports whether the service
// can handle requests right now, including its external dependencies.
//
// It answers 503 with status "degraded" when the manual readiness flag is off
// or when the (cached) database check fails, and 200 with status "ok"
// otherwise.
func (c *Checker) ReadinessHandler(w http.ResponseWriter, r *http.Request) {
	// reply writes body as JSON with the given HTTP status code.
	reply := func(code int, body Status) {
		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(code)
		// Headers are already sent; an encode failure cannot be reported.
		_ = json.NewEncoder(w).Encode(body)
	}

	// The manual flag wins: it is switched off during graceful shutdown.
	if !c.readyFlag.Load() {
		reply(http.StatusServiceUnavailable, Status{
			Status: "degraded",
			Components: map[string]ComponentStatus{
				"app": {Status: "not_ready", Message: "service is shutting down"},
			},
			Timestamp: time.Now(),
		})
		return
	}

	// Database connectivity, served from a short-lived cache so that not
	// every probe request reaches the database.
	database := ComponentStatus{Status: "healthy"}
	overall, code := "ok", http.StatusOK
	if err := c.checkDBCached(r.Context()); err != nil {
		database = ComponentStatus{Status: "unhealthy", Message: err.Error()}
		overall, code = "degraded", http.StatusServiceUnavailable
	}

	reply(code, Status{
		Status:     overall,
		Components: map[string]ComponentStatus{"database": database},
		Timestamp:  time.Now(),
	})
}

// StartupHandler serves the startup probe, which reports whether the
// application has finished initializing. It currently delegates to
// ReadinessHandler and therefore performs exactly the same checks.
func (c *Checker) StartupHandler(w http.ResponseWriter, r *http.Request) {
	// Startup and readiness share the same logic here; the difference lies
	// in the timing parameters configured on the Kubernetes side.
	c.ReadinessHandler(w, r)
}

// checkDBCached reports whether the database answers a ping, reusing the
// previous answer while it is younger than c.dbCacheDur.
//
// ctx is the caller's (request) context; the ping itself is additionally
// bounded to 2 seconds. The return value is nil when the database is
// reachable, otherwise the ping error.
//
// A failure that coincides with the caller's own context being done is
// returned but not cached: it reflects that one request being cancelled or
// running out of time, not the state of the database, and caching it would
// make every other probe fail for the whole cache duration.
func (c *Checker) checkDBCached(ctx context.Context) error {
	// Fast path: serve a still-fresh result under the read lock.
	c.lastDBCheck.mu.RLock()
	fresh := time.Since(c.lastDBCheck.at) < c.dbCacheDur
	cached := c.lastDBCheck.result
	c.lastDBCheck.mu.RUnlock()
	if fresh {
		return cached
	}

	// Cache expired: take the write lock so that only one goroutine pings.
	c.lastDBCheck.mu.Lock()
	defer c.lastDBCheck.mu.Unlock()

	// Re-check under the write lock; another goroutine may have refreshed
	// the cache while this one was waiting for the lock.
	if time.Since(c.lastDBCheck.at) < c.dbCacheDur {
		return c.lastDBCheck.result
	}

	checkCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
	defer cancel()

	err := c.db.PingContext(checkCtx)
	if err != nil && ctx.Err() != nil {
		// The caller's context ended; leave the cache untouched so the
		// next caller performs a real check.
		return err
	}

	c.lastDBCheck.result = err
	c.lastDBCheck.at = time.Now()
	return err
}

// RegisterHandlers registers the three health check endpoints on mux:
// /healthz/live, /healthz/ready and /healthz/startup.
func (c *Checker) RegisterHandlers(mux *http.ServeMux) {
	routes := []struct {
		path    string
		handler http.HandlerFunc
	}{
		{"/healthz/live", c.LivenessHandler},
		{"/healthz/ready", c.ReadinessHandler},
		{"/healthz/startup", c.StartupHandler},
	}
	for _, rt := range routes {
		mux.HandleFunc(rt.path, rt.handler)
	}
}

K8s 配置

# Liveness probe: GET /healthz/live on port 8080 every 10 seconds,
# starting 5 seconds after the container starts.
livenessProbe:
  httpGet:
    path: /healthz/live
    port: 8080
  initialDelaySeconds: 5
  periodSeconds: 10
  failureThreshold: 3     # restart only after 3 consecutive failures
  timeoutSeconds: 5

# Readiness probe: GET /healthz/ready on port 8080 every 5 seconds.
# 3 consecutive failures mark the probe as failed; a single success
# marks it as passing again.
readinessProbe:
  httpGet:
    path: /healthz/ready
    port: 8080
  initialDelaySeconds: 5
  periodSeconds: 5
  failureThreshold: 3
  successThreshold: 1
  timeoutSeconds: 5

# Startup probe: GET /healthz/startup on port 8080 every 5 seconds.
startupProbe:
  httpGet:
    path: /healthz/startup
    port: 8080
  failureThreshold: 30    # allow up to 30 * 5 = 150 seconds for startup to complete
  periodSeconds: 5

一个小细节：探针路径不要放在鉴权中间件后面

K8s 的探针请求默认不带 Authorization header（除非在 httpGet.httpHeaders 里显式配置）。如果健康检查路径被鉴权中间件保护，探针会一直收到 401：liveness 探针失败会导致 Pod 被反复重启，readiness 探针失败会导致 Pod 无法接收流量。

健康检查路径要注册在鉴权中间件之前，或者明确豁免这些路径。