Go 开发系统监控工具——CPU/内存/磁盘/网络采集的完整实现
适读人群:Go 开发者、想自己实现轻量级监控 Agent 的运维工程师 | 阅读时长:约 16 分钟 | 核心价值:用纯 Go 实现系统指标采集,不依赖 Prometheus agent,可嵌入任何 Go 服务
去年底帮一个做在线教育的团队排查性能问题,他们有几十台机器,部署的是 Prometheus + Node Exporter 的监控方案。但有几台特殊的边缘节点,环境受限,装不了 Node Exporter,但又需要监控 CPU 和内存。
他们问我能不能在自己的 Go 服务里直接埋点,把系统指标一起上报。我说当然可以,Go 采集系统指标不需要外部工具,几百行代码就能搞定。
这篇文章把那次实现的完整代码整理出来。
工具选型
Go 里做系统指标采集,主要靠 github.com/shirou/gopsutil/v3,这是最完善的系统信息库,支持 Linux/macOS/Windows。
go get github.com/shirou/gopsutil/v3
如果追求极致轻量,也可以直接读 /proc 文件系统,但跨平台兼容就要自己处理了。gopsutil 已经处理好了跨平台差异。
完整实现
package monitor
import (
	"context"
	"fmt"
	"path/filepath"
	"sort"
	"sync"
	"time"

	"github.com/shirou/gopsutil/v3/cpu"
	"github.com/shirou/gopsutil/v3/disk"
	"github.com/shirou/gopsutil/v3/mem"
	"github.com/shirou/gopsutil/v3/net"
	"github.com/shirou/gopsutil/v3/process"
)
// SystemMetrics 系统指标快照
type SystemMetrics struct {
Timestamp time.Time
CPU CPUMetrics
Memory MemoryMetrics
Disks []DiskMetrics
Networks []NetworkMetrics
TopProcesses []ProcessMetrics
}
// CPUMetrics CPU 指标
type CPUMetrics struct {
UsagePercent float64 // 总体 CPU 使用率
PerCorePercent []float64 // 每个核的使用率
LoadAvg1m float64 // 1 分钟负载均值
LoadAvg5m float64 // 5 分钟
LoadAvg15m float64 // 15 分钟
}
// MemoryMetrics 内存指标
type MemoryMetrics struct {
Total uint64 // 总内存(字节)
Used uint64
Free uint64
UsedPercent float64
SwapTotal uint64
SwapUsed uint64
}
// DiskMetrics 磁盘指标
type DiskMetrics struct {
Path string
Total uint64
Used uint64
UsedPercent float64
ReadBytes uint64 // 自上次采集以来的读取量
WriteBytes uint64
IOWait float64
}
// NetworkMetrics 网络指标
type NetworkMetrics struct {
Interface string
BytesSent uint64 // 自上次采集以来的发送量
BytesRecv uint64
PacketsSent uint64
PacketsRecv uint64
}
// ProcessMetrics 进程指标
type ProcessMetrics struct {
PID int32
Name string
CPUPercent float64
MemoryMB float64
Status string
}
// Collector 指标采集器
type Collector struct {
mu sync.RWMutex
lastDiskStats map[string]disk.IOCountersStat
lastNetStats map[string]net.IOCountersStat
lastCollectAt time.Time
}
func NewCollector() *Collector {
return &Collector{
lastDiskStats: make(map[string]disk.IOCountersStat),
lastNetStats: make(map[string]net.IOCountersStat),
}
}
// Collect 采集一次系统指标
func (c *Collector) Collect(ctx context.Context) (*SystemMetrics, error) {
metrics := &SystemMetrics{Timestamp: time.Now()}
var wg sync.WaitGroup
var mu sync.Mutex
var errs []error
// 并发采集各项指标,提高效率
wg.Add(4)
// CPU
go func() {
defer wg.Done()
cpuMetrics, err := collectCPU(ctx)
if err != nil {
mu.Lock()
errs = append(errs, fmt.Errorf("cpu: %w", err))
mu.Unlock()
return
}
mu.Lock()
metrics.CPU = cpuMetrics
mu.Unlock()
}()
// Memory
go func() {
defer wg.Done()
memMetrics, err := collectMemory()
if err != nil {
mu.Lock()
errs = append(errs, fmt.Errorf("memory: %w", err))
mu.Unlock()
return
}
mu.Lock()
metrics.Memory = memMetrics
mu.Unlock()
}()
// Disk
go func() {
defer wg.Done()
diskMetrics, err := c.collectDisk()
if err != nil {
mu.Lock()
errs = append(errs, fmt.Errorf("disk: %w", err))
mu.Unlock()
return
}
mu.Lock()
metrics.Disks = diskMetrics
mu.Unlock()
}()
// Network
go func() {
defer wg.Done()
netMetrics, err := c.collectNetwork()
if err != nil {
mu.Lock()
errs = append(errs, fmt.Errorf("network: %w", err))
mu.Unlock()
return
}
mu.Lock()
metrics.Networks = netMetrics
mu.Unlock()
}()
wg.Wait()
if len(errs) > 0 {
// 部分采集失败时返回已有数据,不完全失败
fmt.Printf("Warning: some metrics collection failed: %v\n", errs)
}
// 采集 Top 5 进程(单独处理,较耗时)
topProcs, _ := collectTopProcesses(5)
metrics.TopProcesses = topProcs
c.lastCollectAt = metrics.Timestamp
return metrics, nil
}
func collectCPU(ctx context.Context) (CPUMetrics, error) {
var metrics CPUMetrics
// 采集总体 CPU(需要 100ms 间隔)
percents, err := cpu.PercentWithContext(ctx, 100*time.Millisecond, false)
if err != nil {
return metrics, err
}
if len(percents) > 0 {
metrics.UsagePercent = percents[0]
}
// 采集每个核
perCore, err := cpu.PercentWithContext(ctx, 0, true)
if err == nil {
metrics.PerCorePercent = perCore
}
return metrics, nil
}
func collectMemory() (MemoryMetrics, error) {
var metrics MemoryMetrics
vmStat, err := mem.VirtualMemory()
if err != nil {
return metrics, err
}
metrics.Total = vmStat.Total
metrics.Used = vmStat.Used
metrics.Free = vmStat.Free
metrics.UsedPercent = vmStat.UsedPercent
swapStat, err := mem.SwapMemory()
if err == nil {
metrics.SwapTotal = swapStat.Total
metrics.SwapUsed = swapStat.Used
}
return metrics, nil
}
func (c *Collector) collectDisk() ([]DiskMetrics, error) {
partitions, err := disk.Partitions(false)
if err != nil {
return nil, err
}
// 采集磁盘 IO 计数器
ioCounters, _ := disk.IOCounters()
var metrics []DiskMetrics
for _, p := range partitions {
usage, err := disk.Usage(p.Mountpoint)
if err != nil {
continue
}
dm := DiskMetrics{
Path: p.Mountpoint,
Total: usage.Total,
Used: usage.Used,
UsedPercent: usage.UsedPercent,
}
// 计算本次与上次之间的 IO 差值
c.mu.Lock()
if curr, ok := ioCounters[p.Device]; ok {
if last, hasLast := c.lastDiskStats[p.Device]; hasLast {
dm.ReadBytes = curr.ReadBytes - last.ReadBytes
dm.WriteBytes = curr.WriteBytes - last.WriteBytes
}
c.lastDiskStats[p.Device] = curr
}
c.mu.Unlock()
metrics = append(metrics, dm)
}
return metrics, nil
}
func (c *Collector) collectNetwork() ([]NetworkMetrics, error) {
counters, err := net.IOCounters(true)
if err != nil {
return nil, err
}
var metrics []NetworkMetrics
c.mu.Lock()
defer c.mu.Unlock()
for _, counter := range counters {
if counter.Name == "lo" {
continue // 跳过回环接口
}
nm := NetworkMetrics{
Interface: counter.Name,
BytesSent: counter.BytesSent,
BytesRecv: counter.BytesRecv,
PacketsSent: counter.PacketsSent,
PacketsRecv: counter.PacketsRecv,
}
// 计算差值(速率)
if last, ok := c.lastNetStats[counter.Name]; ok {
nm.BytesSent = counter.BytesSent - last.BytesSent
nm.BytesRecv = counter.BytesRecv - last.BytesRecv
}
c.lastNetStats[counter.Name] = counter
metrics = append(metrics, nm)
}
return metrics, nil
}
func collectTopProcesses(n int) ([]ProcessMetrics, error) {
procs, err := process.Processes()
if err != nil {
return nil, err
}
type procInfo struct {
pm ProcessMetrics
cpu float64
}
var infos []procInfo
for _, p := range procs {
name, _ := p.Name()
cpuPct, _ := p.CPUPercent()
memInfo, _ := p.MemoryInfo()
var memMB float64
if memInfo != nil {
memMB = float64(memInfo.RSS) / 1024 / 1024
}
status, _ := p.Status()
statusStr := "unknown"
if len(status) > 0 {
statusStr = status[0]
}
infos = append(infos, procInfo{
pm: ProcessMetrics{
PID: p.Pid,
Name: name,
CPUPercent: cpuPct,
MemoryMB: memMB,
Status: statusStr,
},
cpu: cpuPct,
})
}
// 按 CPU 排序,取前 N 个
sort.Slice(infos, func(i, j int) bool {
return infos[i].cpu > infos[j].cpu
})
var result []ProcessMetrics
for i := 0; i < n && i < len(infos); i++ {
result = append(result, infos[i].pm)
}
return result, nil
}持续采集 + 告警
// MonitorDaemon 持续监控守护进程
type MonitorDaemon struct {
collector *Collector
interval time.Duration
alertCh chan Alert
}
type Alert struct {
Level string // warning / critical
Message string
Metrics *SystemMetrics
}
func (d *MonitorDaemon) Run(ctx context.Context) {
ticker := time.NewTicker(d.interval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
metrics, err := d.collector.Collect(ctx)
if err != nil {
fmt.Println("collect error:", err)
continue
}
// 检查告警条件
d.checkAlerts(metrics)
// 打印摘要
d.printSummary(metrics)
}
}
}
func (d *MonitorDaemon) checkAlerts(m *SystemMetrics) {
if m.CPU.UsagePercent > 90 {
d.alertCh <- Alert{
Level: "critical",
Message: fmt.Sprintf("CPU 使用率 %.1f%% 超过 90%%", m.CPU.UsagePercent),
Metrics: m,
}
}
if m.Memory.UsedPercent > 85 {
d.alertCh <- Alert{
Level: "warning",
Message: fmt.Sprintf("内存使用率 %.1f%% 超过 85%%", m.Memory.UsedPercent),
Metrics: m,
}
}
for _, disk := range m.Disks {
if disk.UsedPercent > 90 {
d.alertCh <- Alert{
Level: "critical",
Message: fmt.Sprintf("磁盘 %s 使用率 %.1f%% 超过 90%%", disk.Path, disk.UsedPercent),
Metrics: m,
}
}
}
}
func (d *MonitorDaemon) printSummary(m *SystemMetrics) {
fmt.Printf("[%s] CPU: %.1f%% | MEM: %.1f%% (%.1fGB/%.1fGB)\n",
m.Timestamp.Format("15:04:05"),
m.CPU.UsagePercent,
m.Memory.UsedPercent,
float64(m.Memory.Used)/1024/1024/1024,
float64(m.Memory.Total)/1024/1024/1024,
)
}踩坑实录
踩坑 1:CPU 采集返回 0%
现象:cpu.Percent(0, false) 返回的结果是 0。
原因:interval 设为 0 表示"自进程启动以来的平均值",第一次调用时进程刚启动,结果趋近于 0。
解法:interval 至少设 100ms,让内核有时间做两次采样:
cpu.Percent(100*time.Millisecond, false)
或者第一次调用获取基准,第二次调用才取差值。
踩坑 2:磁盘 IO 统计的设备名和挂载点对不上
现象:disk.IOCounters() 返回的 key 是设备名(如 sda),但 disk.Partitions() 返回的 Device 是完整路径(如 /dev/sda1)。
原因:IOCounters 的 key 是不带路径的设备名,而 Partition 的 Device 是 /dev/xxx。
解法:在做关联时需要做字符串处理:
deviceName := filepath.Base(p.Device) // "/dev/sda1" -> "sda1"
if curr, ok := ioCounters[deviceName]; ok { ... }
踩坑 3:gopsutil 在容器里拿到的是宿主机数据
现象:在 Docker 容器里运行监控,采集到的 CPU 和内存数据是宿主机的,不是容器自己被分配的资源。
原因:gopsutil 读 /proc 是宿主机的 /proc,容器的 cgroup 限制需要单独读。
解法:容器环境下要读 cgroup 文件来获取容器自身的资源限制和使用量:
// 读取容器内存使用(cgroup v1)
memUsage, _ := os.ReadFile("/sys/fs/cgroup/memory/memory.usage_in_bytes")
memLimit, _ := os.ReadFile("/sys/fs/cgroup/memory/memory.limit_in_bytes")
Prometheus 格式输出
如果需要和 Prometheus 集成,可以把采集结果暴露为 /metrics 接口:
import "github.com/prometheus/client_golang/prometheus"
var cpuGauge = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "system_cpu_usage_percent",
Help: "CPU usage in percent",
})
func updatePrometheusMetrics(m *SystemMetrics) {
cpuGauge.Set(m.CPU.UsagePercent)
// ... 其他指标
}
这样自研的采集 Agent 就能和 Prometheus 生态无缝集成。
