Go Prometheus 指标暴露实战——自定义 Counter、Gauge、Histogram 完整方案

老张2026/4/30大约 7 分钟

Go Prometheus 指标暴露实战——自定义 Counter、Gauge、Histogram 完整方案

适读人群：想给 Go 服务接入监控的工程师 | 阅读时长：约15分钟 | 核心价值：从零到生产级 Prometheus 指标暴露，少走弯路

监控大屏一片绿，业务同事却在告诉你服务挂了

这个故事发生在我们团队刚上线监控系统的第二周。

当时我用 prometheus/client_golang 给 Go 服务加了几个基础指标，CPU、内存、GC 次数全都接上了，Grafana 大屏看着绿油油的，感觉监控建设做得不错。

结果有天下午，业务同事来找我说"下单接口好像有问题，但又不报错"。我去 Grafana 一看，服务存活，内存正常，GC 没有飙升……一切正常。

排查了半天，发现是下单接口的数据库查询开始变慢，P99 从 10ms 涨到了 3 秒，但因为没有超时，请求没有报错只是变慢，业务用户等得不耐烦就放弃了。

而我的监控里压根没有"接口延迟"这个指标。

那次事故之后，我把业务监控重做了一遍，把真正有价值的指标搞清楚了：QPS（Counter）、接口延迟分布（Histogram）、正在处理的请求数（Gauge）、业务成功率（Counter 组合计算）。

Prometheus 四种指标类型

在写代码之前，先把四种指标类型的使用场景搞清楚：

类型	特点	典型用途
Counter	只增不减	请求总数、错误总数、处理消息数
Gauge	可增可减	当前并发数、队列长度、内存使用
Histogram	分桶统计分布	接口延迟、请求大小
Summary	分位数统计	和 Histogram 类似，但计算在客户端

关于 Histogram vs Summary 的选择：我建议优先用 Histogram，原因是：

Histogram 可以跨实例聚合（多个 Pod 的 P99 可以合并计算），Summary 不行
Histogram 的分位数可以在 PromQL 里动态调整，Summary 的分位数在代码里写死

完整实现代码

package metrics

import (
    "net/http"
    "strconv"
    "time"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

// AppMetrics groups all application-level metrics.
type AppMetrics struct {
    // Total number of HTTP requests (Counter).
    // Labels: method (GET/POST, ...), path (endpoint path), status (status code).
    HTTPRequestsTotal *prometheus.CounterVec

    // HTTP request latency distribution (Histogram).
    HTTPRequestDuration *prometheus.HistogramVec

    // Number of HTTP requests currently being handled (Gauge).
    HTTPRequestsInFlight *prometheus.GaugeVec

    // Business metric: total number of orders created.
    OrdersCreatedTotal *prometheus.CounterVec

    // Business metric: current queue length.
    QueueLength *prometheus.GaugeVec

    // Database query latency (Histogram).
    DBQueryDuration *prometheus.HistogramVec

    // Cache hits/misses (Counter).
    CacheOperationsTotal *prometheus.CounterVec
}

// NewAppMetrics builds every collector held by AppMetrics and registers it.
//
// namespace prefixes all metric names. subsystem is applied to the HTTP
// metrics only; the order/queue, database and cache metrics use the fixed
// subsystems "business", "db" and "cache".
//
// The promauto constructors register with the default registry and panic
// when registration fails.
func NewAppMetrics(namespace, subsystem string) *AppMetrics {
    return &AppMetrics{
        HTTPRequestsTotal: promauto.NewCounterVec(
            prometheus.CounterOpts{
                Namespace: namespace,
                Subsystem: subsystem,
                Name:      "http_requests_total",
                Help:      "HTTP 请求总数，按方法、路径、状态码分组",
            },
            []string{"method", "path", "status"},
        ),

        // Histogram buckets should follow the latencies the service really
        // sees. prometheus.DefBuckets is
        // [.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10]; an internal API
        // benefits from finer buckets at the low end, hence the 1ms bucket.
        HTTPRequestDuration: promauto.NewHistogramVec(
            prometheus.HistogramOpts{
                Namespace: namespace,
                Subsystem: subsystem,
                Name:      "http_request_duration_seconds",
                Help:      "HTTP 请求延迟分布（秒）",
                Buckets:   []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2, 5},
            },
            []string{"method", "path"},
        ),

        HTTPRequestsInFlight: promauto.NewGaugeVec(
            prometheus.GaugeOpts{
                Namespace: namespace,
                Subsystem: subsystem,
                Name:      "http_requests_in_flight",
                Help:      "当前正在处理的 HTTP 请求数",
            },
            []string{"method", "path"},
        ),

        // Labels: ordering channel, success/failure.
        OrdersCreatedTotal: promauto.NewCounterVec(
            prometheus.CounterOpts{
                Namespace: namespace,
                Subsystem: "business",
                Name:      "orders_created_total",
                Help:      "订单创建总数",
            },
            []string{"channel", "status"},
        ),

        QueueLength: promauto.NewGaugeVec(
            prometheus.GaugeOpts{
                Namespace: namespace,
                Subsystem: "business",
                Name:      "queue_length",
                Help:      "消息队列当前长度",
            },
            []string{"queue_name"},
        ),

        DBQueryDuration: promauto.NewHistogramVec(
            prometheus.HistogramOpts{
                Namespace: namespace,
                Subsystem: "db",
                Name:      "query_duration_seconds",
                Help:      "数据库查询延迟分布",
                Buckets:   []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5, 1},
            },
            []string{"operation", "table"},
        ),

        // Labels: get/set/del, hit/miss/error.
        CacheOperationsTotal: promauto.NewCounterVec(
            prometheus.CounterOpts{
                Namespace: namespace,
                Subsystem: "cache",
                Name:      "operations_total",
                Help:      "缓存操作总数",
            },
            []string{"operation", "result"},
        ),
    }
}

// RecordHTTPRequest accounts for one finished HTTP request: it increments the
// request counter for (method, path, status code) and adds the duration, in
// seconds, to the latency histogram for (method, path).
func (m *AppMetrics) RecordHTTPRequest(method, path string, statusCode int, duration time.Duration) {
    seconds := duration.Seconds()
    m.HTTPRequestsTotal.WithLabelValues(method, path, strconv.Itoa(statusCode)).Inc()
    m.HTTPRequestDuration.WithLabelValues(method, path).Observe(seconds)
}

// HTTPMiddleware wraps next so that every request is reflected in the HTTP
// metrics: the in-flight gauge is raised while next runs, and once next
// returns the request counter and latency histogram are updated through
// RecordHTTPRequest.
//
// NOTE(review): the "path" label is the raw r.URL.Path, so every distinct URL
// path creates its own time series. For routes that carry IDs in the path,
// substitute the router's route template before using this in production.
func (m *AppMetrics) HTTPMiddleware(next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        path := r.URL.Path
        method := r.Method

        // Resolve the gauge child once and reuse it for Inc and Dec instead
        // of looking up the label values twice per request.
        inFlight := m.HTTPRequestsInFlight.WithLabelValues(method, path)
        inFlight.Inc()
        defer inFlight.Dec()

        // Wrap the ResponseWriter to capture the status code. It starts at
        // 200 because that is what is sent when the handler never calls
        // WriteHeader.
        wrapped := &responseWriter{ResponseWriter: w, statusCode: http.StatusOK}

        start := time.Now()
        next.ServeHTTP(wrapped, r)

        m.RecordHTTPRequest(method, path, wrapped.statusCode, time.Since(start))
    })
}

// responseWriter wraps an http.ResponseWriter and remembers the final status
// code of the response so it can be reported after the handler returns.
//
// The creator is expected to initialise statusCode to http.StatusOK, the
// status net/http sends when a handler writes a body without calling
// WriteHeader.
type responseWriter struct {
    http.ResponseWriter
    statusCode int
    // wroteHeader becomes true once the final status is fixed, either by an
    // explicit WriteHeader or implicitly by the first Write/Flush.
    wroteHeader bool
}

// commitImplicit marks the header as sent without an explicit WriteHeader
// call, defaulting the recorded status to 200 if none was ever set.
func (rw *responseWriter) commitImplicit() {
    if rw.wroteHeader {
        return
    }
    if rw.statusCode == 0 {
        rw.statusCode = http.StatusOK
    }
    rw.wroteHeader = true
}

// WriteHeader forwards the call and records the first final status code.
//
// 1xx informational headers (other than 101) may precede the real status, so
// they are passed through without being recorded. Later calls after the final
// status are superfluous for net/http and must not overwrite the recorded
// value, otherwise the metric would disagree with what the client received.
func (rw *responseWriter) WriteHeader(statusCode int) {
    informational := statusCode >= 100 && statusCode <= 199 &&
        statusCode != http.StatusSwitchingProtocols
    if !informational && !rw.wroteHeader {
        rw.statusCode = statusCode
        rw.wroteHeader = true
    }
    rw.ResponseWriter.WriteHeader(statusCode)
}

// Write forwards the body bytes. The first Write without a prior WriteHeader
// fixes the status at the implicit default, so a later WriteHeader is ignored
// for recording purposes.
func (rw *responseWriter) Write(b []byte) (int, error) {
    rw.commitImplicit()
    return rw.ResponseWriter.Write(b)
}

// Flush forwards to the wrapped writer when it implements http.Flusher, so
// streaming handlers keep working behind the wrapper. It is a no-op otherwise.
func (rw *responseWriter) Flush() {
    flusher, ok := rw.ResponseWriter.(http.Flusher)
    if !ok {
        return
    }
    rw.commitImplicit()
    flusher.Flush()
}

// Unwrap returns the wrapped writer so http.ResponseController can reach
// optional interfaces of the original ResponseWriter.
func (rw *responseWriter) Unwrap() http.ResponseWriter {
    return rw.ResponseWriter
}

// RecordDBQuery adds one database query's duration, in seconds, to the
// latency histogram for the given operation and table.
func (m *AppMetrics) RecordDBQuery(operation, table string, duration time.Duration) {
    histogram := m.DBQueryDuration.WithLabelValues(operation, table)
    histogram.Observe(duration.Seconds())
}

// RecordCacheHit counts one cache operation, labelling it "hit" when hit is
// true and "miss" otherwise.
func (m *AppMetrics) RecordCacheHit(operation string, hit bool) {
    if hit {
        m.CacheOperationsTotal.WithLabelValues(operation, "hit").Inc()
        return
    }
    m.CacheOperationsTotal.WithLabelValues(operation, "miss").Inc()
}

// SetupMetricsEndpoint registers the Prometheus handler on mux under /metrics.
func SetupMetricsEndpoint(mux *http.ServeMux) {
    const metricsPath = "/metrics"

    handler := promhttp.Handler()
    mux.Handle(metricsPath, handler)
}

在业务代码中使用

package main

import (
    "context"
    "database/sql"
    "log"
    "net/http"
    "time"

    "your-project/metrics"
)

// appMetrics is the package-wide metrics handle. It is created during package
// initialisation with namespace "myapp" and subsystem "api".
var appMetrics = metrics.NewAppMetrics("myapp", "api")

// queryWithMetrics runs query through db.QueryContext and records how long
// that call took under the given operation and table labels.
//
// The duration is recorded whether or not the query failed, and covers only
// the QueryContext call itself, not iteration over the returned rows. The
// rows and error are returned unchanged.
func queryWithMetrics(ctx context.Context, db *sql.DB, operation, table, query string, args ...interface{}) (*sql.Rows, error) {
    begin := time.Now()
    rows, err := db.QueryContext(ctx, query, args...)
    appMetrics.RecordDBQuery(operation, table, time.Since(begin))

    return rows, err
}

// createOrderHandler is an example business handler. It runs processOrder and
// counts the outcome in OrdersCreatedTotal for the "web" channel: "failed"
// with a 500 response when processOrder returns an error, "success" with a
// 201 response otherwise.
func createOrderHandler(w http.ResponseWriter, r *http.Request) {
    const channel = "web"

    // Simulated business processing.
    if err := processOrder(r.Context()); err != nil {
        appMetrics.OrdersCreatedTotal.WithLabelValues(channel, "failed").Inc()
        http.Error(w, err.Error(), http.StatusInternalServerError)
        return
    }

    appMetrics.OrdersCreatedTotal.WithLabelValues(channel, "success").Inc()
    w.WriteHeader(http.StatusCreated)
}

// processOrder is a placeholder for the real order logic; it currently always
// returns nil.
func processOrder(ctx context.Context) error {
    // Example of a database call that records metrics:
    // rows, err := queryWithMetrics(ctx, db, "select", "orders", "SELECT ...")
    _ = ctx // not used until real logic is added
    return nil
}

// main serves the example order endpoint and the Prometheus /metrics endpoint
// from a single mux on :8080, and exits with a logged error if the server
// stops.
func main() {
    mux := http.NewServeMux()

    // Business routes, wrapped in the metrics middleware.
    mux.Handle("/api/", appMetrics.HTTPMiddleware(http.HandlerFunc(createOrderHandler)))

    // Prometheus scrape endpoint.
    metrics.SetupMetricsEndpoint(mux)

    srv := &http.Server{
        Addr:    ":8080",
        Handler: mux,
        // Bound the time a client may take to send its request headers so
        // slow or idle connections cannot be held open indefinitely.
        ReadHeaderTimeout: 5 * time.Second,
    }

    // ListenAndServe always returns a non-nil error (for example when the
    // port is already in use); report it instead of exiting silently.
    log.Fatal(srv.ListenAndServe())
}

Prometheus 抓取配置

# prometheus.yml
scrape_configs:
  - job_name: 'myapp'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Keep only Pods annotated with prometheus.io/scrape: "true".
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: "true"
      # Use the metrics path given in the annotation (otherwise the default
      # /metrics stays in effect).
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      # Use the port given in the annotation. __address__ must stay host:port,
      # so keep the host part of the discovered address and swap in the
      # annotated port; writing only the port would drop the Pod IP and the
      # target would be unreachable.
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2

# Pod annotations set through the Deployment. With `role: pod` discovery the
# annotations are read from the Pod itself, so in a Deployment manifest this
# block belongs under spec.template.metadata.
metadata:
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "8080"

常用 PromQL 查询示例

# QPS over the last 5 minutes, one series per label combination
# (wrap in sum(...) for the service-wide total).
rate(myapp_api_http_requests_total[5m])

# Error ratio. Both sides are aggregated first: without sum() the division
# matches series on identical labels, so only 5xx series would be divided by
# themselves and the result would always be 1.
sum(rate(myapp_api_http_requests_total{status=~"5.."}[5m]))
/ sum(rate(myapp_api_http_requests_total[5m]))

# P99 latency (requires a Histogram). Buckets are summed by le so the
# quantile is computed across all instances, methods and paths.
histogram_quantile(0.99,
  sum by (le) (rate(myapp_api_http_request_duration_seconds_bucket[5m]))
)

# Average latency per endpoint path.
sum by (path) (rate(myapp_api_http_request_duration_seconds_sum[5m]))
/ sum by (path) (rate(myapp_api_http_request_duration_seconds_count[5m]))

# Cache hit ratio for get operations. Aggregated for the same reason as the
# error ratio: the hit and miss series carry different "result" labels.
sum(rate(myapp_cache_operations_total{operation="get", result="hit"}[5m]))
/ sum(rate(myapp_cache_operations_total{operation="get"}[5m]))

三个踩坑实录

坑一：Label 基数爆炸，Prometheus OOM

现象：接入监控后，Prometheus 内存持续增长，几天后 OOM。

原因：我把 HTTP 请求的完整 URL（包含查询参数）作为 label，比如 /api/orders?id=123456&user=789。每个唯一的 URL 都会生成一个新的时间序列，很快就有了数百万个序列。

解法：Label 的值要是有限集合，绝对不能用用户 ID、订单 ID 这类高基数值作为 label。接口路径只用路由模板（/api/orders/{id}），不用实际路径（/api/orders/123456）。

// Wrong: label with the concrete request path.
path := r.URL.Path // "/api/orders/123456" - different for every order ID!

// Right: label with the route template.
// With gorilla/mux:
path := r.URL.Path // fallback when no route or template is available
if route := mux.CurrentRoute(r); route != nil {
    if template, err := route.GetPathTemplate(); err == nil {
        path = template // "/api/orders/{id}"
    }
}

坑二：Histogram Buckets 设置不合理

现象：P99 监控几乎是一条直线，一直停在最小 bucket 的上界（5ms）附近，但实际上大部分请求都是几毫秒内完成的，延迟的真实波动完全看不出来。

原因：用了默认的 prometheus.DefBuckets，最小的 bucket 是 5ms，对于我们这种毫秒级接口，所有请求都落在同一个 bucket 里，无法区分。

解法：根据业务特性自定义 buckets。内部 API 设 1ms、5ms、10ms、25ms、50ms、100ms 这样精细的 bucket；对外 API 可以粗一些。

坑三：Counter 重置后曲线掉到 0

现象：Grafana 里 QPS 曲线每隔几小时就会突然掉到 0，然后重新爬升。

原因：服务重启后，Counter 从 0 开始计数。如果面板直接展示 Counter 的原始值（或者自己用相邻两点的差值来算 QPS），曲线就会在重启的那一刻掉到 0 再重新爬升。而 rate() 和 increase() 在检测到 Counter 值减小时，会认为发生了重置并自动补偿，不会出现这种断崖。

解法：用 rate() 而不是直接展示 Counter 的绝对值，rate() 会处理 Counter 重置。如果需要展示累计值，用 increase()。

Java 对比

Java 里 Micrometer + Actuator 的组合，底层对接 Prometheus 也是类似的 Counter/Gauge/Timer（Timer 对应 Histogram）。

主要差异在于：Java 里通过 @Timed 注解就能给方法加延迟监控，框架帮你做了很多自动装配。

Go 里需要显式地在代码里调用 RecordHTTPRequest，代码量更多，但控制也更精细。我更喜欢 Go 这种方式，因为你清楚地知道每一个指标是在哪里被记录的，不容易出现"指标为什么在这里"的困惑。

小结

Go 接入 Prometheus 的要点：

Label 基数要控制：高基数 label 是 Prometheus OOM 的头号杀手
Histogram Buckets 要自定义：根据业务延迟特征设置合理的分桶
用独立端口暴露 /metrics：不要和业务端口混在一起（上文示例为了简洁，把 /metrics 和业务接口挂在了同一个 8080 端口上，生产环境建议拆开）
Counter 用 rate() 查询：直接看 Counter 绝对值没什么意义
业务指标比系统指标更有价值：CPU/内存是背景，订单成功率才是核心