📊 Ledger Operations Monitoring

This document provides a complete monitoring and operations guide for the Ledger service, covering metrics collection, log management, alert configuration, and troubleshooting.

🎯 Monitoring Architecture

Monitoring Overview

Core Monitoring Metrics

| Metric Type | Dimension | Key Metrics |
| --- | --- | --- |
| Service metrics | Performance | QPS, latency, error rate |
| Business metrics | Functionality | Transaction count, ledger count |
| Infrastructure | Resources | CPU, memory, disk |
| Database | Storage | Connection count, query performance |

📈 Prometheus Monitoring

Metric Definitions

Service-Level Metrics

go
// internal/pkg/metrics/metrics.go
var (
    // HTTP request counter
    httpRequestsTotal = prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "ledger_http_requests_total",
            Help: "Total number of HTTP requests",
        },
        []string{"method", "endpoint", "status"},
    )
    
    // HTTP request latency histogram
    httpRequestDuration = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "ledger_http_request_duration_seconds",
            Help:    "HTTP request duration in seconds",
            Buckets: prometheus.DefBuckets,
        },
        []string{"method", "endpoint"},
    )
    
    // gRPC request counter
    grpcRequestsTotal = prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "ledger_grpc_requests_total",
            Help: "Total number of gRPC requests",
        },
        []string{"service", "method", "status"},
    )
    
    // Database connection pool gauge
    dbConnectionsActive = prometheus.NewGauge(
        prometheus.GaugeOpts{
            Name: "ledger_db_connections_active",
            Help: "Number of active database connections",
        },
    )
    
    // Business metric: processed transactions
    transactionsTotal = prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "ledger_transactions_total",
            Help: "Total number of transactions processed",
        },
        []string{"type", "status"},
    )
)
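
These collectors are only declared above; they still need to be registered and wired into request handling. The sketch below shows one way that could look, assuming it lives in the same package as the metrics and the HTTP API is served with net/http (the InstrumentHandler and statusRecorder names are illustrative, not taken from the repository):

go
import (
    "net/http"
    "strconv"
    "time"

    "github.com/prometheus/client_golang/prometheus"
)

func init() {
    // Register all collectors with the default registry so they appear on /metrics.
    prometheus.MustRegister(
        httpRequestsTotal,
        httpRequestDuration,
        grpcRequestsTotal,
        dbConnectionsActive,
        transactionsTotal,
    )
}

// InstrumentHandler wraps an http.Handler and records the request counter
// and latency histogram defined above.
func InstrumentHandler(endpoint string, next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        start := time.Now()
        rec := &statusRecorder{ResponseWriter: w, status: http.StatusOK}
        next.ServeHTTP(rec, r)

        httpRequestsTotal.WithLabelValues(r.Method, endpoint, strconv.Itoa(rec.status)).Inc()
        httpRequestDuration.WithLabelValues(r.Method, endpoint).Observe(time.Since(start).Seconds())
    })
}

// statusRecorder captures the response status code for labeling.
type statusRecorder struct {
    http.ResponseWriter
    status int
}

func (r *statusRecorder) WriteHeader(code int) {
    r.status = code
    r.ResponseWriter.WriteHeader(code)
}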

Custom Business Metrics

go
// Ledger-related metrics
var (
    ledgersTotal = prometheus.NewGauge(
        prometheus.GaugeOpts{
            Name: "ledger_ledgers_total",
            Help: "Total number of ledgers",
        },
    )
    
    budgetUsage = prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "ledger_budget_usage_percentage",
            Help: "Budget usage percentage",
        },
        []string{"ledger_id", "budget_id"},
    )
    
    processedTasksTotal = prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "ledger_processed_tasks_total",
            Help: "Total number of processed tasks",
        },
        []string{"task_type", "status"},
    )
)
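
How these business metrics are updated is not shown above; a possible sketch, with hypothetical helpers (recordTransaction, refreshBudgetUsage, setLedgerCount) and the assumption that budget usage is spent / limit expressed as a percentage:

go
// recordTransaction increments the transaction counter with a success/failed status.
func recordTransaction(txType string, err error) {
    status := "success"
    if err != nil {
        status = "failed"
    }
    transactionsTotal.WithLabelValues(txType, status).Inc()
}

// refreshBudgetUsage sets the usage gauge for one budget; it would typically be
// called from a background job that reads current spend and limit from the database.
func refreshBudgetUsage(ledgerID, budgetID string, spent, limit float64) {
    if limit <= 0 {
        return
    }
    budgetUsage.WithLabelValues(ledgerID, budgetID).Set(spent / limit * 100)
}

// setLedgerCount updates the total ledger gauge, e.g. after a periodic COUNT(*) query.
func setLedgerCount(n int64) {
    ledgersTotal.Set(float64(n))
}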

Prometheus Configuration

Service Discovery Configuration

yaml
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "alert_rules.yml"

scrape_configs:
  # Ledger service scrape job
  - job_name: 'ledger-service'
    static_configs:
      - targets: ['ledger-service:9090']
    metrics_path: '/metrics'
    scrape_interval: 10s
    
  # Kubernetes service discovery
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)

alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - alertmanager:9093
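
The ledger-service job above scrapes /metrics on port 9090. Assuming the service uses the default Prometheus registry, exposing that endpoint could look like this minimal sketch (the port and path are chosen to match the scrape config, not taken from the service code):

go
package main

import (
    "log"
    "net/http"

    "github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
    // Serve the default registry on :9090/metrics so the 'ledger-service'
    // scrape job defined above can reach it.
    http.Handle("/metrics", promhttp.Handler())
    log.Fatal(http.ListenAndServe(":9090", nil))
}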

📊 Grafana Dashboards

Service Overview Dashboard

Core Metric Panels

json
{
  "dashboard": {
    "title": "Ledger Service Overview",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(ledger_http_requests_total[5m])",
            "legendFormat": "{{method}} {{endpoint}}"
          }
        ]
      },
      {
        "title": "Response Time",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(ledger_http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "95th percentile"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "singlestat",
        "targets": [
          {
            "expr": "rate(ledger_http_requests_total{status=~\"5..\"}[5m]) / rate(ledger_http_requests_total[5m]) * 100"
          }
        ]
      }
    ]
  }
}

Business Metrics Dashboard

Financial Data Monitoring

json
{
  "panels": [
    {
      "title": "Daily Transactions",
      "type": "graph",
      "targets": [
        {
          "expr": "increase(ledger_transactions_total[1d])",
          "legendFormat": "{{type}}"
        }
      ]
    },
    {
      "title": "Active Ledgers",
      "type": "singlestat",
      "targets": [
        {
          "expr": "ledger_ledgers_total"
        }
      ]
    },
    {
      "title": "Budget Usage",
      "type": "table",
      "targets": [
        {
          "expr": "ledger_budget_usage_percentage",
          "format": "table"
        }
      ]
    }
  ]
}

📝 Log Management

Structured Logging Configuration

Log Format Standard

go
// internal/pkg/logger/logger.go
type Logger struct {
    logger *zap.Logger
}

func NewLogger(level string) (*Logger, error) {
    config := zap.NewProductionConfig()
    config.Level = zap.NewAtomicLevelAt(getLogLevel(level))
    
    // Custom fields attached to every log entry
    config.InitialFields = map[string]interface{}{
        "service": "ledger",
        "version": version.GetVersion(),
    }
    
    logger, err := config.Build()
    if err != nil {
        return nil, err
    }
    
    return &Logger{logger: logger}, nil
}

func (l *Logger) Info(msg string, fields ...zap.Field) {
    l.logger.Info(msg, fields...)
}

func (l *Logger) Error(msg string, err error, fields ...zap.Field) {
    allFields := append(fields, zap.Error(err))
    l.logger.Error(msg, allFields...)
}

// Business transaction logging
func (l *Logger) LogTransaction(ctx context.Context, tx *Transaction, action string) {
    l.logger.Info("transaction_operation",
        zap.String("action", action),
        zap.String("transaction_id", tx.ID),
        zap.String("ledger_id", tx.LedgerID),
        zap.String("type", tx.Type.String()),
        zap.Int64("amount", tx.Amount.Amount),
        zap.String("user_id", getUserID(ctx)),
        zap.Time("timestamp", time.Now()),
    )
}
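
getLogLevel and getUserID are referenced above but not shown. A plausible sketch of both helpers, assuming zapcore levels and a plain string context key for the user ID (both are assumptions rather than the repository's actual implementation):

go
import (
    "context"
    "strings"

    "go.uber.org/zap/zapcore"
)

// getLogLevel maps a textual level to a zap level; unknown values fall back to info.
func getLogLevel(level string) zapcore.Level {
    switch strings.ToLower(level) {
    case "debug":
        return zapcore.DebugLevel
    case "warn":
        return zapcore.WarnLevel
    case "error":
        return zapcore.ErrorLevel
    default:
        return zapcore.InfoLevel
    }
}

// getUserID reads the user ID from the request context; the "user_id" key is
// hypothetical and would normally be a typed context key set by auth middleware.
func getUserID(ctx context.Context) string {
    if v, ok := ctx.Value("user_id").(string); ok {
        return v
    }
    return "unknown"
}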

ELK Stack Integration

Filebeat Configuration

yaml
# filebeat.yml
filebeat.inputs:
- type: container
  paths:
    - '/var/lib/docker/containers/*/*.log'
  processors:
  - add_kubernetes_metadata:
      host: ${NODE_NAME}
      matchers:
      - logs_path:
          logs_path: "/var/lib/docker/containers/"

output.elasticsearch:
  hosts: ["elasticsearch:9200"]
  index: "ledger-logs-%{+yyyy.MM.dd}"

setup.template.name: "ledger-logs"
setup.template.pattern: "ledger-logs-*"

Logstash Processing Pipeline

ruby
# logstash.conf
input {
  beats {
    port => 5044
  }
}

filter {
  if [kubernetes][labels][app] == "ledger" {
    json {
      source => "message"
    }
    
    date {
      match => [ "timestamp", "ISO8601" ]
    }
    
    mutate {
      add_field => { "service" => "ledger" }
    }
  }
}

output {
  elasticsearch {
    hosts => ["elasticsearch:9200"]
    index => "ledger-logs-%{+YYYY.MM.dd}"
  }
}

🚨 Alerting Configuration

Prometheus Alert Rules

Service Availability Alerts

yaml
# alert_rules.yml
groups:
- name: ledger-service
  rules:
  # Service-down alert
  - alert: LedgerServiceDown
    expr: up{job="ledger-service"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Ledger service is down"
      description: "Ledger service has been down for more than 1 minute"
  
  # High error-rate alert
  - alert: HighErrorRate
    expr: rate(ledger_http_requests_total{status=~"5.."}[5m]) / rate(ledger_http_requests_total[5m]) > 0.05
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High error rate detected"
      description: "Error rate is {{ $value | humanizePercentage }} for 5 minutes"
  
  # Response-time alert
  - alert: HighResponseTime
    expr: histogram_quantile(0.95, rate(ledger_http_request_duration_seconds_bucket[5m])) > 2
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High response time"
      description: "95th percentile response time is {{ $value }}s"

- name: ledger-business
  rules:
  # Business anomaly alert
  - alert: TransactionProcessingFailure
    expr: rate(ledger_transactions_total{status="failed"}[10m]) > 0.1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "High transaction failure rate"
      description: "Transaction failure rate is {{ $value }} per second"
  
  # Database connection alert
  - alert: DatabaseConnectionIssue
    expr: ledger_db_connections_active / ledger_db_connections_max > 0.8
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Database connection pool usage high"
      description: "Database connection usage is {{ $value | humanizePercentage }}"
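
The DatabaseConnectionIssue rule divides by ledger_db_connections_max, which is not among the metrics defined earlier. One way to export both gauges from database/sql pool statistics, sketched under the assumption that the service uses database/sql (the gauge name is chosen to match the rule; the 15s polling interval is arbitrary):

go
import (
    "database/sql"
    "time"

    "github.com/prometheus/client_golang/prometheus"
)

var dbConnectionsMax = prometheus.NewGauge(prometheus.GaugeOpts{
    Name: "ledger_db_connections_max",
    Help: "Maximum number of open database connections",
})

// collectDBStats polls the connection pool and updates both gauges.
// It assumes dbConnectionsActive and dbConnectionsMax have been registered.
func collectDBStats(db *sql.DB) {
    for range time.Tick(15 * time.Second) {
        stats := db.Stats()
        dbConnectionsActive.Set(float64(stats.InUse))
        dbConnectionsMax.Set(float64(stats.MaxOpenConnections))
    }
}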

AlertManager Configuration

Notification Routing Configuration

yaml
# alertmanager.yml
global:
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: 'alerts@yourcompany.com'

route:
  group_by: ['alertname', 'severity']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
  - match:
      severity: critical
    receiver: 'critical-alerts'
  - match:
      severity: warning
    receiver: 'warning-alerts'

receivers:
- name: 'default'
  slack_configs:
  - api_url: 'YOUR_SLACK_WEBHOOK_URL'
    channel: '#alerts'
    title: 'Ledger Service Alert'
    text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

- name: 'critical-alerts'
  email_configs:
  - to: 'team@yourcompany.com'
    subject: 'CRITICAL: Ledger Service Alert'
    body: |
      {{ range .Alerts }}
      Alert: {{ .Annotations.summary }}
      Description: {{ .Annotations.description }}
      {{ end }}
  slack_configs:
  - api_url: 'YOUR_SLACK_WEBHOOK_URL'
    channel: '#critical-alerts'
    title: 'CRITICAL: Ledger Service'
    color: 'danger'

🔍 Distributed Tracing

Jaeger Integration

Tracer Configuration

go
// internal/pkg/tracing/tracer.go
func InitTracer(serviceName string) (opentracing.Tracer, io.Closer, error) {
    cfg := jaegerconfig.Configuration{
        ServiceName: serviceName,
        Sampler: &jaegerconfig.SamplerConfig{
            Type:  jaeger.SamplerTypeConst,
            Param: 1,
        },
        Reporter: &jaegerconfig.ReporterConfig{
            LogSpans: true,
            BufferFlushInterval: 1 * time.Second,
            LocalAgentHostPort: "jaeger-agent:6831",
        },
    }
    
    tracer, closer, err := cfg.NewTracer()
    if err != nil {
        return nil, nil, err
    }
    
    opentracing.SetGlobalTracer(tracer)
    return tracer, closer, nil
}

// Using tracing inside a service method
func (s *LedgerService) CreateTransaction(ctx context.Context, req *CreateTransactionRequest) (*Transaction, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "CreateTransaction")
    defer span.Finish()
    
    span.SetTag("ledger_id", req.LedgerID)
    span.SetTag("transaction_type", req.Type.String())
    
    // Business logic...
    
    return transaction, nil
}
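
For a trace to continue across service boundaries, the span context has to be injected into outgoing requests. A minimal sketch using the opentracing-go HTTPHeaders carrier (callDownstream and its url parameter are illustrative, not part of the service):

go
import (
    "context"
    "net/http"

    "github.com/opentracing/opentracing-go"
)

func callDownstream(ctx context.Context, url string) (*http.Response, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "callDownstream")
    defer span.Finish()

    req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
    if err != nil {
        return nil, err
    }

    // Inject the span context into the HTTP headers so the downstream
    // service can join the same trace.
    if err := opentracing.GlobalTracer().Inject(
        span.Context(),
        opentracing.HTTPHeaders,
        opentracing.HTTPHeadersCarrier(req.Header),
    ); err != nil {
        return nil, err
    }

    return http.DefaultClient.Do(req)
}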

📱 Operations Scripts

Health Check Script

bash
#!/bin/bash
# scripts/health_check.sh

SERVICE_URL="http://localhost:8080"
TIMEOUT=10

echo "Checking Ledger service health..."

# Check basic liveness
if curl -f -s --max-time $TIMEOUT "$SERVICE_URL/health" > /dev/null; then
    echo "✅ Health check passed"
else
    echo "❌ Health check failed"
    exit 1
fi

# Check readiness
if curl -f -s --max-time $TIMEOUT "$SERVICE_URL/ready" > /dev/null; then
    echo "✅ Readiness check passed"
else
    echo "❌ Readiness check failed"
    exit 1
fi

# Check the metrics endpoint
if curl -f -s --max-time $TIMEOUT "$SERVICE_URL/metrics" > /dev/null; then
    echo "✅ Metrics endpoint accessible"
else
    echo "❌ Metrics endpoint failed"
    exit 1
fi

echo "All checks passed!"

Performance Benchmarks

bash
#!/bin/bash
# scripts/performance_test.sh

echo "Running performance benchmarks..."

# Go benchmarks
echo "1. Running Go benchmarks..."
go test -bench=. -benchmem ./...

# API performance test
echo "2. Running API performance tests..."
ab -n 1000 -c 10 http://localhost:8080/api/v1/ledgers

# Database benchmarks
echo "3. Running database performance tests..."
go test -tags=integration -bench=BenchmarkDatabase -run='^$' ./internal/app/ledger/

echo "Performance tests completed!"

🔧 Troubleshooting

Common Problem Diagnosis

1. Performance Issues

bash
# Check CPU and memory usage
kubectl top pods -l app=ledger

# Inspect detailed pod resource usage
kubectl describe pod <pod-name>

# Analyze slow queries: open a shell in the pod, then tail the slow log inside it
kubectl exec -it <pod-name> -- /bin/sh
tail -f /var/log/mysql/slow.log

2. Connectivity Issues

bash
# Check network connectivity to the database
kubectl exec -it <pod-name> -- nc -zv mysql-service 3306

# Check service discovery endpoints
kubectl get endpoints ledger-service

# Check DNS resolution
kubectl exec -it <pod-name> -- nslookup mysql-service

3. Log Analysis

bash
# View error logs
kubectl logs -l app=ledger | grep ERROR

# View logs from a specific time window (here: the last hour)
kubectl logs --since=1h -l app=ledger

# View logs from the previous container instance (e.g. after a restart)
kubectl logs <pod-name> --previous

📚 Related Documentation


💡 Monitoring tip: A well-built monitoring system is key to keeping the service running reliably. Review and refine the monitoring metrics regularly so that problems can be detected and resolved promptly.

Released under the MIT License