Go语言故障排查:线上问题处理
# Go语言故障排查:线上问题处理

## 1. 引言

线上故障是每个技术团队都必须面对的挑战。如何快速定位问题、恢复服务、减少损失是每个工程师需要掌握的技能。本文将深入讲解Go语言微服务中的故障排查方法,包括日志分析、pprof性能分析、trace分析、常见问题处理和复盘流程。

## 2. 日志分析

### 2.1 结构化日志设计

```go
package logging

import (
	"os"
	"time"

	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)

// InitLogger 初始化日志
func InitLogger(env string) {
	zerolog.TimeFieldFormat = time.RFC3339

	if env == "production" {
		// 生产环境:JSON格式输出
		log.Logger = zerolog.New(os.Stdout).With().Timestamp().Caller().Logger()
	} else {
		// 开发环境:彩色控制台输出
		log.Logger = zerolog.New(zerolog.ConsoleWriter{Out: os.Stdout}).With().Timestamp().Caller().Logger()
	}

	zerolog.SetGlobalLevel(zerolog.InfoLevel)
}

// RequestLogger 请求日志中间件
func RequestLogger() gin.HandlerFunc {
	return func(c *gin.Context) {
		start := time.Now()
		path := c.Request.URL.Path

		c.Next()

		latency := time.Since(start)
		status := c.Writer.Status()

		log.Info().
			Str("method", c.Request.Method).
			Str("path", path).
			Int("status", status).
			Dur("latency", latency).
			Str("client_ip", c.ClientIP()).
			Str("user_agent", c.Request.UserAgent()).
			Int("body_size", c.Writer.Size()).
			Msg("request completed")
	}
}

// ErrorLogger 错误日志
func LogError(err error, context map[string]interface{}) {
	fields := make([]interface{}, 0, len(context)*2)
	for k, v := range context {
		fields = append(fields, k, v)
	}

	log.Error().
		Err(err).
		Fields(fields).
		Msg("operation failed")
}
```

### 2.2 日志查询技巧

```bash
# 使用jq进行日志分析

# 提取错误日志
cat app.log | jq 'select(.level == "error")'

# 提取特定用户的日志
cat app.log | jq 'select(.user_id == "12345")'

# 提取慢请求 (延迟 > 1s)
cat app.log | jq 'select(.latency > 1) | {path, latency, status}'

# 统计每分钟错误数
cat app.log | jq -r 'select(.level == "error") | .timestamp[:16]' | sort | uniq -c

# 关联用户ID和错误
cat app.log | jq 'select(.level == "error") | {timestamp, user_id, error: .error.message}'

# Loki查询示例

# 查询最近5分钟的错误日志
{job="myapp", level="error"} |= "database"

# 查询特定用户的完整请求流程
{job="myapp"} |= "user_id=12345" | json

# 统计错误趋势
sum by (level) (count_over_time({job="myapp"} |= "error" [5m]))
```

### 2.3 分布式日志追踪

```go
package trace

import (
	"context"

	"github.com/google/uuid"
)

type contextKey string

const (
	RequestIDKey contextKey = "request_id"
	UserIDKey    contextKey = "user_id"
)

// WithRequestID 添加请求ID到context
func WithRequestID(ctx context.Context, requestID string) context.Context {
	return context.WithValue(ctx, RequestIDKey, requestID)
}

// GetRequestID 从context获取请求ID
func GetRequestID(ctx context.Context) string {
	if id, ok := ctx.Value(RequestIDKey).(string); ok {
		return id
	}
	return uuid.New().String()
}

// WithUserID 添加用户ID到context
func WithUserID(ctx context.Context, userID uint) context.Context {
	return context.WithValue(ctx, UserIDKey, userID)
}

// GetUserID 从context获取用户ID
func GetUserID(ctx context.Context) uint {
	if id, ok := ctx.Value(UserIDKey).(uint); ok {
		return id
	}
	return 0
}

// Middleware 添加请求ID中间件
func RequestIDMiddleware() gin.HandlerFunc {
	return func(c *gin.Context) {
		requestID := c.GetHeader("X-Request-ID")
		if requestID == "" {
			requestID = uuid.New().String()
		}

		c.Set("request_id", requestID)
		c.Header("X-Request-ID", requestID)

		ctx := context.WithValue(c.Request.Context(), RequestIDKey, requestID)
		c.Request = c.Request.WithContext(ctx)

		c.Next()
	}
}
```

3. 
pprof性能分析

### 3.1 pprof配置

```go
package main

import (
	"net/http"
	_ "net/http/pprof"
	"runtime"

	"github.com/gin-gonic/gin"
)

func main() {
	// 启用pprof
	go func() {
		// pprof页面
		http.HandleFunc("/debug/pprof/", http.DefaultServeMux.ServeHTTP)
		http.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
		http.HandleFunc("/debug/pprof/profile", pprof.Profile)
		http.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
		http.HandleFunc("/debug/pprof/trace", pprof.Trace)

		http.ListenAndServe(":6060", nil)
	}()

	// 业务代码
	r := gin.Default()
	r.GET("/health", func(c *gin.Context) {
		c.JSON(200, gin.H{"status": "ok"})
	})
	r.Run(":8080")
}

// 设置GC配置
func init() {
	// 设置GOMAXPROCS
	runtime.GOMAXPROCS(runtime.NumCPU())

	// 设置GOGC
	// 环境变量 GOGC=100 (默认) 表示GC触发时堆增长100%
	// 降低GOGC可以减少内存使用,但增加CPU使用
}
```

### 3.2 CPU分析

```bash
# 获取CPU profile (30秒)
go tool pprof http://localhost:6060/debug/pprof/profile

# 或者使用curl
curl -o cpu.prof http://localhost:6060/debug/pprof/profile?seconds=30

# 交互式分析
go tool pprof -http=:8081 cpu.prof

# 命令行分析
go tool pprof cpu.prof
(pprof) top 10
(pprof) web  # 生成SVG图

# 火焰图分析
# 安装火焰图工具
go install github.com/uber/go-torch@latest

# 生成火焰图
go-torch -f cpu.prof -o cpu.svg
```

### 3.3 内存分析

```bash
# 获取堆profile
curl http://localhost:6060/debug/pprof/heap -o heap.prof

# 分析内存分配
go tool pprof -http=:8081 heap.prof

# 查看对象数量
(pprof) top -cum
(pprof) list function_name  # 查看特定函数详情

# 对比两次heap
go tool pprof -base=heap1.prof heap2.prof

# 查看goroutine profile (排查goroutine泄漏)
curl http://localhost:6060/debug/pprof/goroutine -o goroutine.prof

# 查看goroutine数量随时间变化
watch -n 5 'curl -s http://localhost:6060/debug/pprof/goroutine | wc -l'
```

### 3.4 Goroutine分析

```go
package debug

import (
	"net/http"
	"runtime"
	"runtime/pprof"
	"time"
)

// DumpGoroutines 导出goroutine堆栈
func DumpGoroutines() {
	pprof.Lookup("goroutine").WriteTo(os.Stdout, 1)
}

// 定期检查goroutine数量
func MonitorGoroutines(interval time.Duration, maxCount int) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for range ticker.C {
		count := runtime.NumGoroutine()
		if count > maxCount {
			log.Warn().Int("goroutines", count).Int("max", maxCount).Msg("goroutine count exceeded")
			DumpGoroutines()
		}
	}
}
```

## 4. Trace分析

### 4.1 OpenTelemetry Trace

```go
package tracing

import (
	"context"
	"fmt"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/exporters/jaeger"
	"go.opentelemetry.io/otel/sdk/trace"
)

func InitTracer(serviceName, endpoint string) (func(context.Context) error, error) {
	exporter, err := jaeger.New(jaeger.WithCollectorEndpoint(jaeger.WithEndpoint(endpoint)))
	if err != nil {
		return nil, err
	}

	tp := trace.NewTracerProvider(
		trace.WithBatcher(exporter),
		trace.WithSampler(trace.AlwaysSample()),
	)

	otel.SetTracerProvider(tp)

	return tp.Shutdown, nil
}

// TraceFunc 带追踪的函数执行
func TraceFunc(ctx context.Context, name string, fn func(context.Context) error) error {
	ctx, span := otel.Tracer("").Start(ctx, name)
	defer span.End()

	if err := fn(ctx); err != nil {
		span.RecordError(err)
		span.SetAttributes(attribute.Bool("error", true))
		return err
	}

	return nil
}

// Gin中间件
func TracingMiddleware() gin.HandlerFunc {
	tracer := otel.Tracer("")

	return func(c *gin.Context) {
		ctx := c.Request.Context()
		ctx, span := tracer.Start(ctx, fmt.Sprintf("%s %s", c.Request.Method, c.FullPath()))
		defer span.End()

		span.SetAttributes(
			attribute.String("http.method", c.Request.Method),
			attribute.String("http.url", c.Request.URL.String()),
			attribute.String("http.host", c.Request.Host),
		)

		c.Request = c.Request.WithContext(ctx)
		c.Next()

		span.SetAttributes(attribute.Int("http.status_code", c.Writer.Status()))
	}
}
```

### 4.2 Trace查询

```bash
# Jaeger查询

# 查找特定trace ID
trace_id=abc123def456

# 查找包含特定操作的trace
service=myapp operation=/api/v1/users

# 查找慢trace
service=myapp duration>5s

# 查找错误的trace
service=myapp tag.error=true

# Loki Trace关联
{service="myapp"} |= "error" | json | trace_id=~".*"
```

## 5. 常见问题处理

### 5.1 CPU问题排查

```go
package diagnose

import (
	"os"
	"runtime"
	"runtime/pprof"
	"time"
)

// CPUProfile 采集CPU profile
func CPUProfile(duration time.Duration) (string, error) {
	// 创建临时文件
	f, err := os.CreateTemp("", "cpu*.prof")
	if err != nil {
		return "", err
	}
	defer f.Close()

	// 开始采集
	if err := pprof.StartCPUProfile(f); err != nil {
		return "", err
	}

	// 等待采集完成
	time.Sleep(duration)
	pprof.StopCPUProfile()

	return f.Name(), nil
}

// AnalyzeCPU 分析CPU使用
func AnalyzeCPU() string {
	var stats runtime.MemStats
	runtime.ReadMemStats(&stats)

	return fmt.Sprintf(`
CPU分析:
  GOMAXPROCS: %d
  NumCPU: %d
  NumGoroutine: %d

GC信息:
  GC次数: %d
  GC暂停时间: %s
  GC CPU占用: %.2f%%
`,
		runtime.GOMAXPROCS(0),
		runtime.NumCPU(),
		runtime.NumGoroutine(),
		stats.NumGC,
		time.Duration(stats.PauseTotalNs),
		float64(stats.GCCPUFraction)*100,
	)
}
```

### 5.2 内存问题排查

```go
package diagnose

import (
	"fmt"
	"os"
	"runtime"
	"runtime/pprof"
	"time"
)

// DumpHeap 导出堆内存
func DumpHeap(filename string) error {
	f, err := os.Create(filename)
	if err != nil {
		return err
	}
	defer f.Close()

	return pprof.Lookup("heap").WriteTo(f, 0)
}

// AnalyzeMemory 分析内存使用
func AnalyzeMemory() string {
	var m runtime.MemStats
	runtime.ReadMemStats(&m)

	return fmt.Sprintf(`
内存分析:
  Alloc: %s
  TotalAlloc: %s
  Sys: %s

堆内存:
  HeapAlloc: %s
  HeapSys: %s
  HeapIdle: %s
  HeapInuse: %s

GC:
  NumGC: %d
  NextGC: %s
  GCCPUFraction: %.2f%%
`,
		formatBytes(m.Alloc),
		formatBytes(m.TotalAlloc),
		formatBytes(m.Sys),
		formatBytes(m.HeapAlloc),
		formatBytes(m.HeapSys),
		formatBytes(m.HeapIdle),
		formatBytes(m.HeapInuse),
		m.NumGC,
		formatBytes(m.NextGC),
		float64(m.GCCPUFraction)*100,
	)
}

func formatBytes(bytes uint64) string {
	const unit = 1024
	if bytes < unit {
		return fmt.Sprintf("%d B", bytes)
	}
	div, exp := uint64(unit), 0
	for n := bytes / unit; n >= unit; n /= unit {
		div *= unit
		exp++
	}
	return fmt.Sprintf("%.1f %cB", float64(bytes)/float64(div), "KMGTPE"[exp])
}
```

### 5.3 Goroutine泄漏排查

```go
package diagnose

import (
	"fmt"
	"runtime"
	"runtime/pprof"
	"strings"
	"time"
)

// AnalyzeGoroutines 分析goroutine
func AnalyzeGoroutines() string {
	var buf strings.Builder

	// 获取goroutine数量
	n := runtime.NumGoroutine()
	buf.WriteString(fmt.Sprintf("当前Goroutine数量: %d\n\n", n))

	// 获取goroutine堆栈
	pprof.Lookup("goroutine").WriteTo(&buf, 1)

	return buf.String()
}

// MonitorGoroutineLeak 监控goroutine泄漏
func MonitorGoroutineLeak(threshold int, interval time.Duration) {
	baseline := runtime.NumGoroutine()

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for range ticker.C {
		current := runtime.NumGoroutine()
		if current > baseline+threshold {
			fmt.Printf("警告: Goroutine数量从 %d 增长到 %d\n", baseline, current)
			fmt.Println(AnalyzeGoroutines())
		}
	}
}
```

### 5.4 数据库连接池问题

```go
package diagnose

import (
	"database/sql"
	"fmt"

	_ "github.com/go-sql-driver/mysql"
)

// DBPoolStats 数据库连接池状态
type DBPoolStats struct {
	MaxOpenConnections int
	OpenConnections    int
	InUse              int
	Idle               int
	WaitCount          int64
	WaitDuration       int64
	MaxIdleClosed      int64
	MaxLifetimeClosed  int64
}

// GetDBPoolStats 获取连接池状态
func GetDBPoolStats(db *sql.DB) *DBPoolStats {
	stats := db.Stats()
	return &DBPoolStats{
		MaxOpenConnections: stats.MaxOpenConnections,
		OpenConnections:    stats.OpenConnections,
		InUse:              stats.InUse,
		Idle:               stats.Idle,
		WaitCount:          stats.WaitCount,
		WaitDuration:       stats.WaitDuration.Nanoseconds(),
		MaxIdleClosed:      stats.MaxIdleClosed,
		MaxLifetimeClosed:  stats.MaxLifetimeClosed,
	}
}

// AnalyzeDBPool 分析连接池
func AnalyzeDBPool(stats *DBPoolStats) string {
	usage := float64(stats.InUse) / float64(stats.MaxOpenConnections) * 100

	return fmt.Sprintf(`
数据库连接池分析:
  最大连接数: %d
  当前连接数: %d
  使用中: %d
  空闲: %d
  使用率: %.1f%%

等待统计:
  等待次数: %d
  等待时间: %dms
  关闭的最大空闲连接数: %d
  关闭的最大生命周期连接数: %d
`,
		stats.MaxOpenConnections,
		stats.OpenConnections,
		stats.InUse,
		stats.Idle,
		usage,
		stats.WaitCount,
		stats.WaitDuration/1000000,
		stats.MaxIdleClosed,
		stats.MaxLifetimeClosed,
	)
}
```

### 5.5 网络超时问题

```go
package diagnose

import (
	"context"
	"net"
	"time"
)

// PingResult Ping结果
type PingResult struct {
	Target   string
	Success  bool
	Duration time.Duration
	Error    error
}

// TCPPing TCP Ping
func TCPPing(target string, port int, timeout time.Duration) *PingResult {
	addr := fmt.Sprintf("%s:%d", target, port)

	start := time.Now()
	conn, err := net.DialTimeout("tcp", addr, timeout)
	duration := time.Since(start)

	if err != nil {
		return &PingResult{
			Target:   target,
			Success:  false,
			Duration: duration,
			Error:    err,
		}
	}
	defer conn.Close()

	return &PingResult{
		Target:   target,
		Success:  true,
		Duration: duration,
	}
}

// DNSLookup DNS查询
func DNSLookup(hostname string) ([]string, error) {
	addrs, err := net.LookupHost(hostname)
	if err != nil {
		return nil, err
	}
	return addrs, nil
}
```

## 6. 故障复盘流程

### 6.1 复盘报告模板

```markdown
# 故障复盘报告

## 基本信息
- 故障时间: YYYY-MM-DD HH:MM - HH:MM
- 持续时长: XX分钟
- 影响范围: XXX服务/XXX用户
- 严重程度: P0/P1/P2/P3

## 故障描述
简要描述故障现象

## 故障原因
详细描述根本原因

## 故障处理过程
| 时间 | 动作 | 负责人 |
|------|------|--------|
| HH:MM | 发现告警 | xxx |
| HH:MM | 开始排查 | xxx |
| HH:MM | 定位问题 | xxx |
| HH:MM | 开始修复 | xxx |
| HH:MM | 验证修复 | xxx |
| HH:MM | 故障恢复 | xxx |

## 影响分析
- 影响用户数: XXX
- 订单损失: XXX
- 其他影响: XXX

## 根因分析
[使用5Why分析方法]

## 改进措施
| 措施 | 负责人 | 完成时间 | 状态 |
|------|--------|----------|------|
| 1. | xxx | YYYY-MM-DD | 待完成 |
| 2. | xxx | YYYY-MM-DD | 待完成 |

## 经验总结
- 做得好的:
- 需要改进的:
```

### 6.2 快速恢复命令

```bash
#!/bin/bash
# emergency_recovery.sh - 紧急恢复脚本

# 1. 查看服务状态
kubectl get pods -n production

# 2. 查看日志
kubectl logs -f deployment/myapp -n production --tail=100

# 3. 重启服务
kubectl rollout restart deployment/myapp -n production

# 4. 回滚到上一个版本
kubectl rollout undo deployment/myapp -n production

# 5. 扩缩容
kubectl scale deployment/myapp --replicas=5 -n production

# 6. 进入Pod调试
kubectl exec -it deployment/myapp -n production -- /bin/sh

# 7. 查看资源使用
kubectl top pods -n production
kubectl top nodes

# 8. 端口转发进行本地调试
kubectl port-forward deployment/myapp 8080:8080 -n production
```

### 6.3 健康检查脚本

```go
package health

import (
	"context"
	"database/sql"
	"fmt"
	"net"
	"net/http"
	"time"
)

// HealthChecker 健康检查器
type HealthChecker struct {
	db    *sql.DB
	redis *RedisClient
}

type HealthStatus struct {
	Overall   string                 `json:"overall"`
	Checks    map[string]CheckResult `json:"checks"`
	Timestamp time.Time              `json:"timestamp"`
}

type CheckResult struct {
	Status  string `json:"status"`
	Latency string `json:"latency,omitempty"`
	Error   string `json:"error,omitempty"`
}

func (h *HealthChecker) Check() *HealthStatus {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	status := &HealthStatus{
		Checks:    make(map[string]CheckResult),
		Timestamp: time.Now(),
	}

	allHealthy := true

	// 检查数据库
	if err := h.checkDatabase(ctx); err != nil {
		status.Checks["database"] = CheckResult{Status: "unhealthy", Error: err.Error()}
		allHealthy = false
	} else {
		status.Checks["database"] = CheckResult{Status: "healthy"}
	}

	// 检查Redis
	if err := h.checkRedis(ctx); err != nil {
		status.Checks["redis"] = CheckResult{Status: "unhealthy", Error: err.Error()}
		allHealthy = false
	} else {
		status.Checks["redis"] = CheckResult{Status: "healthy"}
	}

	// 检查网络
	if err := h.checkNetwork(); err != nil {
		status.Checks["network"] = CheckResult{Status: "degraded", Error: err.Error()}
	} else {
		status.Checks["network"] = CheckResult{Status: "healthy"}
	}

	if allHealthy {
		status.Overall = "healthy"
	} else {
		status.Overall = "unhealthy"
	}

	return status
}

func (h *HealthChecker) checkDatabase(ctx context.Context) error {
	start := time.Now()
	err := h.db.PingContext(ctx)
	if err != nil {
		return err
	}
	return nil
}

func (h *HealthChecker) checkRedis(ctx context.Context) error {
	start := time.Now()
	err := h.redis.Ping(ctx).Err()
	if err != nil {
		return err
	}
	return nil
}

func (h *HealthChecker) checkNetwork() error {
	addrs := []string{"8.8.8.8:53", "114.114.114.114:53"}
	for _, addr := range addrs {
		conn, err := net.DialTimeout("tcp", addr, 2*time.Second)
		if err == nil {
			conn.Close()
			return nil
		}
	}
	return fmt.Errorf("network unreachable")
}
```

7. 
总结

故障排查是一项需要不断练习和总结的技能:

- 日志分析:结构化日志和关联ID是排查的基础,善用日志查询工具
- pprof分析:CPU和内存profile能快速定位性能瓶颈
- Trace追踪:分布式环境下,链路追踪是排查跨服务问题的利器
- 常见问题处理:建立常见问题的检查清单,提高排查效率
- 复盘流程:每次故障都是改进的机会,文档化和知识沉淀同样重要

遇到故障时保持冷静,按照系统化的流程排查,往往比盲目尝试更有效率。