package main

import (
    "fmt"
    "net/http"
    _ "net/http/pprof" // 注册pprof handler
)

// main starts the business server and a separate pprof server, then blocks
// forever.
//
// The blank import of net/http/pprof registers its handlers on
// http.DefaultServeMux. The business listener therefore gets its own mux:
// passing a nil handler there would also serve /debug/pprof/ on :8080.
func main() {
    // Business service. Register business handlers on bizMux so the
    // profiling endpoints are not reachable through the business port.
    bizMux := http.NewServeMux()
    go func() {
        // ListenAndServe only returns on failure; report it instead of
        // silently running without the server.
        if err := http.ListenAndServe(":8080", bizMux); err != nil {
            fmt.Println("business server stopped:", err)
        }
    }()

    // pprof service on its own port (never expose it to the public internet).
    // A nil handler selects http.DefaultServeMux, where pprof is registered.
    go func() {
        fmt.Println("pprof server at :6060")
        if err := http.ListenAndServe(":6060", nil); err != nil {
            fmt.Println("pprof server stopped:", err)
        }
    }()

    // Stand-in for real business load: block the main goroutine forever.
    select {}
}

可用的pprof endpoint：

http://localhost:6060/debug/pprof/         # 概览
http://localhost:6060/debug/pprof/heap     # 内存heap profile
http://localhost:6060/debug/pprof/goroutine # goroutine数量和栈
http://localhost:6060/debug/pprof/profile?seconds=30 # 采集30秒CPU profile
http://localhost:6060/debug/pprof/trace?seconds=5    # trace数据

方式2：代码内嵌（适合benchmark测试）

package main

import (
    "os"
    "runtime/pprof"
    "time"
)

// main runs the profiled workload and, if profiling fails, reports the error
// on stderr and exits with a non-zero status.
func main() {
    if err := run(); err != nil {
        os.Stderr.WriteString("profiling failed: " + err.Error() + "\n")
        os.Exit(1)
    }
}

// run records a CPU profile of heavyWork into cpu.prof and then writes a heap
// profile to mem.prof. It returns the first error from creating, writing or
// closing either file.
func run() (err error) {
    // CPU profile.
    cpuFile, err := os.Create("cpu.prof")
    if err != nil {
        return err
    }
    // Deferred calls run last-in-first-out, so StopCPUProfile (registered
    // below) finishes writing the profile before the file is closed here.
    defer func() {
        if cerr := cpuFile.Close(); err == nil {
            err = cerr
        }
    }()

    if err = pprof.StartCPUProfile(cpuFile); err != nil {
        return err
    }
    defer pprof.StopCPUProfile()

    // Your code...
    heavyWork()

    // Heap profile, written once the workload has finished.
    memFile, err := os.Create("mem.prof")
    if err != nil {
        return err
    }
    defer func() {
        if cerr := memFile.Close(); err == nil {
            err = cerr
        }
    }()

    return pprof.WriteHeapProfile(memFile)
}

// heavyWork stands in for the workload being profiled: it pauses the calling
// goroutine for one second.
func heavyWork() {
    const pause = 1 * time.Second
    time.Sleep(pause)
}

方式3：go test 集成

# 运行测试并采集CPU/内存profile
go test -cpuprofile=cpu.prof -memprofile=mem.prof -bench=. ./...

四、分析 CPU Profile

# 采集30秒的CPU profile
curl -o cpu.prof "http://localhost:6060/debug/pprof/profile?seconds=30"

# 进入交互式分析
go tool pprof cpu.prof

# 常用命令
(pprof) top           # 按消耗时间排序，列出前10个函数
(pprof) top -cum      # 按累计时间排序（包含子函数调用）
(pprof) list <func>   # 查看某个函数每行代码的CPU消耗
(pprof) web           # 在浏览器里显示调用关系图（SVG，需要graphviz）；火焰图请用下文的 -http web界面

火焰图分析

# 生成火焰图（推荐使用pprof的web界面）
go tool pprof -http=:8090 cpu.prof
# 然后访问 http://localhost:8090/ui/flamegraph

火焰图怎么看：

X轴：不代表时间顺序，代表CPU时间占比（越宽越耗CPU）
Y轴：调用栈深度。传统火焰图的入口函数在底部、越往上调用越深；pprof web界面的火焰图方向相反，入口函数（root）在顶部，越往下调用越深
找最宽的「平台」：那就是CPU热点

五、实战：CPU性能问题排查

案例1：正则表达式重复编译（小江的问题）

package main

import (
    "regexp"
    "testing"
)

// validateEmailBad reports whether email matches the address pattern.
//
// Deliberately slow variant: the pattern is recompiled on every call.
func validateEmailBad(email string) bool {
    return regexp.MustCompile(`^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$`).MatchString(email)
}

// emailRegexp is the address pattern, compiled once at package level and
// reused by every call to validateEmailGood.
var emailRegexp = regexp.MustCompile(
    `^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$`,
)

// validateEmailGood reports whether email matches emailRegexp.
func validateEmailGood(email string) bool {
    matched := emailRegexp.MatchString(email)
    return matched
}

// BenchmarkValidateEmailBad measures validateEmailBad on a fixed address; it
// is one half of the comparison with BenchmarkValidateEmailGood.
func BenchmarkValidateEmailBad(b *testing.B) {
    const addr = "user@example.com"
    for remaining := b.N; remaining > 0; remaining-- {
        validateEmailBad(addr)
    }
}

// BenchmarkValidateEmailGood measures validateEmailGood on a fixed address.
func BenchmarkValidateEmailGood(b *testing.B) {
    const addr = "user@example.com"
    for remaining := b.N; remaining > 0; remaining-- {
        validateEmailGood(addr)
    }
}

// 运行：go test -bench=. -benchmem
// 差距通常在100倍以上

案例2：字符串拼接的正确姿势

package main

import (
    "fmt"
    "strings"
    "testing"
)

// BenchmarkStringConcat compares four ways of joining the same four short
// strings: the + operator, fmt.Sprintf, strings.Builder and strings.Join.
// Each variant runs as its own sub-benchmark and reports its allocations.
func BenchmarkStringConcat(b *testing.B) {
    parts := []string{"Hello", ", ", "World", "!"}

    b.Run("Plus", func(b *testing.B) {
        b.ReportAllocs()
        for i := 0; i < b.N; i++ {
            s := ""
            for _, p := range parts {
                s += p // every += allocates a new string
            }
        }
    })

    b.Run("Sprintf", func(b *testing.B) {
        b.ReportAllocs()
        for i := 0; i < b.N; i++ {
            _ = fmt.Sprintf("%s%s%s%s", parts[0], parts[1], parts[2], parts[3])
        }
    })

    b.Run("Builder", func(b *testing.B) {
        b.ReportAllocs()
        for i := 0; i < b.N; i++ {
            var sb strings.Builder
            for _, p := range parts {
                sb.WriteString(p)
            }
            _ = sb.String()
        }
    })

    b.Run("Join", func(b *testing.B) {
        b.ReportAllocs()
        for i := 0; i < b.N; i++ {
            _ = strings.Join(parts, "")
        }
    })
}

六、内存 Profile 分析

# 采集内存profile
curl -o heap.prof http://localhost:6060/debug/pprof/heap

# 分析
go tool pprof -alloc_space heap.prof  # 按总分配量分析
go tool pprof -inuse_space heap.prof  # 按当前使用量分析

(pprof) top           # 找内存分配最多的函数
(pprof) list <func>   # 看具体哪行代码分配的

内存泄漏排查

package main

import (
    "fmt"
    "net/http"
    _ "net/http/pprof"
    "runtime"
    "time"
)

// main serves the pprof endpoints on :6060, prints runtime memory statistics
// every five seconds, and then blocks forever.
func main() {
    go func() {
        // ListenAndServe only returns on failure (for example when the port
        // is already in use); report it instead of silently running without
        // the pprof endpoints.
        if err := http.ListenAndServe(":6060", nil); err != nil {
            fmt.Println("pprof server stopped:", err)
        }
    }()

    // Periodically print memory statistics.
    go func() {
        // ReadMemStats fills in the struct on every call, so a single
        // value is reused across iterations.
        var m runtime.MemStats
        for {
            runtime.ReadMemStats(&m)
            fmt.Printf("Heap: %dMB, Sys: %dMB, NumGC: %d, Goroutines: %d\n",
                m.HeapAlloc/1024/1024, // bytes -> MiB
                m.Sys/1024/1024,       // bytes -> MiB
                m.NumGC,
                runtime.NumGoroutine(),
            )
            time.Sleep(5 * time.Second)
        }
    }()

    // Keep the process alive so the goroutines above keep running.
    select {}
}

如果HeapAlloc持续增长，并且在NumGC增加（即已经发生过多次GC）之后仍然没有回落，很可能存在内存泄漏。

七、trace 分析：最细粒度的性能视图

trace 记录了Go程序内部的所有事件：goroutine的创建/阻塞/调度、GC事件、系统调用等。

# 采集5秒trace
curl -o trace.out "http://localhost:6060/debug/pprof/trace?seconds=5"

# 在浏览器里查看
go tool trace trace.out

trace视图里能看到：

goroutine调度时间线：每个goroutine在哪段时间运行、阻塞、等待
GC事件：STW停顿的时间和位置
系统调用：哪些调用花了多少时间
网络IO延迟：连接等待、读写耗时

八、完整优化案例：从200ms到8ms

我曾经优化过一个接口，原来P99是200ms，pprof揭示了三个问题：

问题1：JSON序列化占了40% CPU

pprof显示：大量CPU消耗在 encoding/json.Marshal 上，而且每次都序列化同样的静态配置数据。

优化：加了一层Redis缓存，静态数据序列化一次存Redis，请求来了直接读缓存返回。

问题2：数据库查询N+1

内存profile显示大量小对象分配，追踪到GORM的查询中。上线pprof后发现每次请求发了50+条SQL。

优化：加了Preload，50条SQL变成2条。

问题3：string格式化日志

pprof显示 fmt.Sprintf 消耗了12% CPU，原因是每条日志都用Sprintf拼接字段。

优化：换成zap的强类型字段，CPU消耗降到1%以下。

三个优化加起来，P99从200ms降到了8ms。

九、benchmark测试：开发阶段的性能守门员

pprof是线上问题的诊断工具，benchmark是开发阶段的质量保证：

package main

import (
    "fmt"
    "testing"
)

// sum is the function under test: it returns 1 + 2 + ... + n computed by
// iteration, and 0 when n is less than 1.
func sum(n int) int {
    acc := 0
    for k := n; k >= 1; k-- {
        acc += k
    }
    return acc
}

// sumFormula returns 1 + 2 + ... + n in O(1) using the closed form n*(n+1)/2.
// It returns 0 when n is less than 1, so it agrees with the iterative sum for
// every input.
//
// Exactly one of n and n+1 is even. Halving that factor before multiplying
// keeps the intermediate product from overflowing when the final result
// still fits in an int.
func sumFormula(n int) int {
    if n < 1 {
        return 0
    }
    if n%2 == 0 {
        return n / 2 * (n + 1)
    }
    return (n + 1) / 2 * n
}

// SumSink receives the result of every benchmarked sum call so the compiler
// cannot discard the call as dead code.
var SumSink int

// BenchmarkSum measures the iterative sum for n = 1000.
func BenchmarkSum(b *testing.B) {
    b.ResetTimer() // reset the timer so anything before the loop is not measured
    for i := 0; i < b.N; i++ {
        SumSink = sum(1000)
    }
}

// SumFormulaSink receives the result of every benchmarked sumFormula call so
// the compiler cannot inline the call and discard it as dead code.
var SumFormulaSink int

// BenchmarkSumFormula measures the closed-form sumFormula for n = 1000.
func BenchmarkSumFormula(b *testing.B) {
    for i := 0; i < b.N; i++ {
        SumFormulaSink = sumFormula(1000)
    }
}

// BenchmarkAllocations compares the memory allocations of two approaches:
// appending 100 ints to a slice created with capacity 100, and appending
// them to a slice created with no capacity. Both sub-benchmarks report
// allocations.
func BenchmarkAllocations(b *testing.B) {
    const elems = 100

    b.Run("make-slice", func(sb *testing.B) {
        sb.ReportAllocs()
        for iter := 0; iter < sb.N; iter++ {
            buf := make([]int, 0, elems)
            for v := 0; v < elems; v++ {
                buf = append(buf, v)
            }
        }
    })

    b.Run("no-prealloc", func(sb *testing.B) {
        sb.ReportAllocs()
        for iter := 0; iter < sb.N; iter++ {
            buf := make([]int, 0)
            for v := 0; v < elems; v++ {
                buf = append(buf, v)
            }
        }
    })
}

// main only prints a hint on how to run the benchmarks.
func main() {
    const hint = "使用 go test -bench=. -benchmem 运行"
    fmt.Println(hint)
}

十、性能优化的方法论

经过多次性能优化实战，我总结出一套方法：

第一步：确认问题存在

用监控数据（P99、CPU%、内存）确认有问题
不要在没有数据的情况下优化

第二步：定位热点

用pprof采集CPU和内存profile
找出消耗最多资源的前3个函数
不要靠直觉猜

第三步：理解原因

为什么这个函数消耗这么多资源？
是算法问题？数据结构问题？还是调用频率问题？

第四步：针对性优化

算法问题：换算法（O(n²)→O(n log n)）
内存分配问题：sync.Pool、预分配、减少逃逸
调用频率问题：缓存、批量处理、减少重复计算

第五步：验证效果

用benchmark验证优化前后的差距
上线后用监控数据验证实际效果
保留优化前的benchmark作为基准

十一、Java vs Go 性能工具对比

工具	Java	Go
CPU分析	JProfiler/YourKit	pprof CPU profile
内存分析	JVisualVM/MAT	pprof heap profile
线程/协程	jstack	pprof goroutine
火焰图	async-profiler	pprof + go tool pprof -http
GC分析	GC日志 + GCViewer	GODEBUG=gctrace=1
实时监控	JMX + JConsole	expvar + /debug/pprof

十二、总结

Go性能分析的完整工具链：

pprof HTTP接口：生产环境长期开启（仅内网访问），随时可采集
CPU profile + 火焰图：找CPU热点，定位高频调用函数
Heap profile：找内存分配热点，发现泄漏
trace：分析goroutine调度、GC暂停的细节
benchmark：开发阶段的性能守门员，数字说话

小江后来成了团队里pprof用得最溜的人——因为他踩了"凭感觉优化"的坑之后，真正理解了数据驱动的价值。

性能优化的本质是：先量化，再优化，最后验证。任何没有数据支撑的优化，都是在猜谜。