📊 Ledger 运维监控
本文档提供 Ledger 账本服务的完整监控和运维指南，包括指标收集、日志管理、告警配置和故障排除。
🎯 监控架构
监控体系概览
核心监控指标
| 指标类型 | 监控维度 | 关键指标 |
|---|---|---|
| 服务指标 | 性能 | QPS、延迟、错误率 |
| 业务指标 | 功能 | 交易数量、账本数量 |
| 基础设施 | 资源 | CPU、内存、磁盘 |
| 数据库 | 存储 | 连接数、查询性能 |
📈 Prometheus 监控
指标定义
服务级别指标
go
// internal/pkg/metrics/metrics.go
var (
// HTTP 请求指标
httpRequestsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "ledger_http_requests_total",
Help: "Total number of HTTP requests",
},
[]string{"method", "endpoint", "status"},
)
// HTTP 请求延迟
httpRequestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "ledger_http_request_duration_seconds",
Help: "HTTP request duration in seconds",
Buckets: prometheus.DefBuckets,
},
[]string{"method", "endpoint"},
)
// gRPC 请求指标
grpcRequestsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "ledger_grpc_requests_total",
Help: "Total number of gRPC requests",
},
[]string{"service", "method", "status"},
)
// 数据库连接池指标
dbConnectionsActive = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "ledger_db_connections_active",
Help: "Number of active database connections",
},
)
// 业务指标
transactionsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "ledger_transactions_total",
Help: "Total number of transactions processed",
},
[]string{"type", "status"},
)
)自定义业务指标
go
// 账本相关指标
var (
ledgersTotal = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "ledger_ledgers_total",
Help: "Total number of ledgers",
},
)
budgetUsage = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "ledger_budget_usage_percentage",
Help: "Budget usage percentage",
},
[]string{"ledger_id", "budget_id"},
)
processedTasksTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "ledger_processed_tasks_total",
Help: "Total number of processed tasks",
},
[]string{"task_type", "status"},
)
)Prometheus 配置
服务发现配置
yaml
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "alert_rules.yml"

scrape_configs:
  # Static scrape target for the Ledger service.
  - job_name: 'ledger-service'
    metrics_path: '/metrics'
    scrape_interval: 10s
    static_configs:
      - targets:
          - 'ledger-service:9090'

  # Kubernetes pod discovery: keep only pods annotated for scraping and
  # honour a metrics path supplied through the pod annotation.
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093
📊 Grafana 仪表板
服务概览仪表板
核心指标面板
json
{
  "dashboard": {
    "title": "Ledger Service Overview",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(ledger_http_requests_total[5m])",
            "legendFormat": "{{method}} {{endpoint}}"
          }
        ]
      },
      {
        "title": "Response Time",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(ledger_http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "95th percentile"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "singlestat",
        "targets": [
          {
            "expr": "sum(rate(ledger_http_requests_total{status=~\"5..\"}[5m])) / sum(rate(ledger_http_requests_total[5m])) * 100"
          }
        ]
      }
    ]
  }
}
业务指标仪表板
财务数据监控
json
{
  "panels": [
    {
      "title": "Daily Transactions",
      "type": "graph",
      "targets": [
        { "expr": "increase(ledger_transactions_total[1d])", "legendFormat": "{{type}}" }
      ]
    },
    {
      "title": "Active Ledgers",
      "type": "singlestat",
      "targets": [
        { "expr": "ledger_ledgers_total" }
      ]
    },
    {
      "title": "Budget Usage",
      "type": "table",
      "targets": [
        { "expr": "ledger_budget_usage_percentage", "format": "table" }
      ]
    }
  ]
}
📝 日志管理
结构化日志配置
日志格式标准
go
// internal/pkg/logger/logger.go
// Logger is a thin wrapper around a *zap.Logger; the methods declared on it
// forward to the wrapped logger.
type Logger struct {
// logger is the underlying zap logger that every method delegates to.
logger *zap.Logger
}
// NewLogger builds a Logger from zap's production configuration.
//
// The level string is translated by getLogLevel and used as the logger's
// level. Every entry also carries two fixed fields: "service" (always
// "ledger") and "version" (from version.GetVersion).
//
// It returns the error from the zap configuration's Build unchanged, in which
// case the returned *Logger is nil.
func NewLogger(level string) (*Logger, error) {
	cfg := zap.NewProductionConfig()
	cfg.Level = zap.NewAtomicLevelAt(getLogLevel(level))

	// Fields attached to every log entry.
	initial := make(map[string]interface{}, 2)
	initial["service"] = "ledger"
	initial["version"] = version.GetVersion()
	cfg.InitialFields = initial

	base, err := cfg.Build()
	if err != nil {
		return nil, err
	}

	wrapped := &Logger{logger: base}
	return wrapped, nil
}
// Info logs msg together with the given structured fields by forwarding both
// to the wrapped zap logger's Info method.
func (l *Logger) Info(msg string, fields ...zap.Field) {
l.logger.Info(msg, fields...)
}
// Error logs msg through the wrapped zap logger's Error method, attaching err
// (via zap.Error) after the caller-supplied fields.
//
// The fields are copied into a fresh slice before err is appended. Appending
// directly to the variadic slice could write into the caller's backing array
// when the caller passes an existing slice with spare capacity
// (logger.Error(msg, err, shared...)), silently modifying data the caller
// still owns.
func (l *Logger) Error(msg string, err error, fields ...zap.Field) {
	allFields := make([]zap.Field, 0, len(fields)+1)
	allFields = append(allFields, fields...)
	allFields = append(allFields, zap.Error(err))
	l.logger.Error(msg, allFields...)
}
// 业务日志记录
func (l *Logger) LogTransaction(ctx context.Context, tx *Transaction, action string) {
l.logger.Info("transaction_operation",
zap.String("action", action),
zap.String("transaction_id", tx.ID),
zap.String("ledger_id", tx.LedgerID),
zap.String("type", tx.Type.String()),
zap.Int64("amount", tx.Amount.Amount),
zap.String("user_id", getUserID(ctx)),
zap.Time("timestamp", time.Now()),
)
}ELK Stack 集成
Filebeat 配置
yaml
# filebeat.yml
filebeat.inputs:
  # Read container log files and enrich each event with Kubernetes metadata.
  - type: container
    paths:
      - '/var/lib/docker/containers/*/*.log'
    processors:
      - add_kubernetes_metadata:
          host: ${NODE_NAME}
          matchers:
            - logs_path:
                logs_path: "/var/lib/docker/containers/"

# Ship events directly to Elasticsearch, one index per day.
output.elasticsearch:
  hosts:
    - "elasticsearch:9200"
  index: "ledger-logs-%{+yyyy.MM.dd}"

# Template name/pattern matching the custom index name above.
setup.template.name: "ledger-logs"
setup.template.pattern: "ledger-logs-*"
Logstash 处理管道
ruby
# logstash.conf
# Receives events from Beats, parses the JSON log lines of the ledger
# application, and writes them to a daily Elasticsearch index.
input {
  beats {
    port => 5044
  }
}

filter {
  # Only touch events coming from pods labelled app=ledger.
  if [kubernetes][labels][app] == "ledger" {
    # The application logs one JSON object per line.
    json {
      source => "message"
    }
    # Accept epoch seconds ("UNIX") as well as ISO8601. The Go logger shown in
    # this document is built from zap.NewProductionConfig(), which presumably
    # encodes time values as epoch seconds by default (verify against the zap
    # version in use). An ISO8601-only match would then tag those events with
    # a date parse failure.
    date {
      match => [ "timestamp", "ISO8601", "UNIX" ]
    }
    # Use replace rather than add_field: the logger already emits a "service"
    # field, and add_field on an existing field would turn it into an array.
    mutate {
      replace => { "service" => "ledger" }
    }
  }
}

output {
  elasticsearch {
    hosts => ["elasticsearch:9200"]
    index => "ledger-logs-%{+YYYY.MM.dd}"
  }
}
🚨 告警配置
Prometheus 告警规则
服务可用性告警
yaml
# alert_rules.yml
groups:
  - name: ledger-service
    rules:
      # Service down: the ledger-service scrape target has been unreachable.
      - alert: LedgerServiceDown
        expr: up{job="ledger-service"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Ledger service is down"
          description: "Ledger service has been down for more than 1 minute"

      # High error rate. Both sides are aggregated with sum() first: without
      # it the division only pairs series whose labels are identical, i.e.
      # each 5xx series with itself, so the ratio is always 1 instead of the
      # share of failed requests.
      - alert: HighErrorRate
        expr: sum(rate(ledger_http_requests_total{status=~"5.."}[5m])) / sum(rate(ledger_http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for 5 minutes"

      # Slow responses: 95th percentile latency above 2 seconds.
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(ledger_http_request_duration_seconds_bucket[5m])) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High response time"
          description: "95th percentile response time is {{ $value }}s"

  - name: ledger-business
    rules:
      # Business failure: failed transactions exceed 0.1 per second.
      - alert: TransactionProcessingFailure
        expr: rate(ledger_transactions_total{status="failed"}[10m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High transaction failure rate"
          description: "Transaction failure rate is {{ $value }} per second"

      # Connection pool pressure: more than 80% of the pool is in use.
      # This rule needs the service to export ledger_db_connections_max
      # alongside ledger_db_connections_active; if that series is missing the
      # expression returns nothing and the alert can never fire.
      - alert: DatabaseConnectionIssue
        expr: ledger_db_connections_active / ledger_db_connections_max > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Database connection pool usage high"
          description: "Database connection usage is {{ $value | humanizePercentage }}"
AlertManager 配置
通知路由配置
yaml
# alertmanager.yml
global:
  # NOTE(review): no smtp_auth_username / smtp_auth_password is configured;
  # confirm the relay accepts unauthenticated mail or add credentials.
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: 'alerts@yourcompany.com'

route:
  group_by: ['alertname', 'severity']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  # Fallback receiver for alerts that match none of the child routes.
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
    - match:
        severity: warning
      receiver: 'warning-alerts'

receivers:
  - name: 'default'
    slack_configs:
      - api_url: 'YOUR_SLACK_WEBHOOK_URL'
        channel: '#alerts'
        title: 'Ledger Service Alert'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

  - name: 'critical-alerts'
    email_configs:
      - to: 'team@yourcompany.com'
        # email_config has no subject/body keys: the subject is set through
        # headers and the plain-text body through text.
        headers:
          Subject: 'CRITICAL: Ledger Service Alert'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}
    slack_configs:
      - api_url: 'YOUR_SLACK_WEBHOOK_URL'
        channel: '#critical-alerts'
        title: 'CRITICAL: Ledger Service'
        color: 'danger'

  # The warning route above refers to this receiver; every receiver named in
  # a route must be defined or the configuration fails to load. The channel
  # below is a placeholder to adjust.
  - name: 'warning-alerts'
    slack_configs:
      - api_url: 'YOUR_SLACK_WEBHOOK_URL'
        channel: '#alerts'
        title: 'WARNING: Ledger Service'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
        color: 'warning'
🔍 分布式追踪
Jaeger 集成
追踪配置
go
// internal/pkg/tracing/tracer.go
// InitTracer creates a Jaeger tracer for serviceName and installs it as the
// OpenTracing global tracer.
//
// The tracer uses a constant sampler with Param 1 and a reporter that logs
// spans, flushes its buffer every second, and sends to the agent at
// "jaeger-agent:6831".
//
// On success it returns the tracer and the io.Closer produced by the Jaeger
// configuration. On failure it returns the error from NewTracer and leaves
// the global tracer untouched.
func InitTracer(serviceName string) (opentracing.Tracer, io.Closer, error) {
	sampler := &jaegerconfig.SamplerConfig{
		Type:  jaeger.SamplerTypeConst,
		Param: 1,
	}
	reporter := &jaegerconfig.ReporterConfig{
		LogSpans:            true,
		BufferFlushInterval: 1 * time.Second,
		LocalAgentHostPort:  "jaeger-agent:6831",
	}
	cfg := jaegerconfig.Configuration{
		ServiceName: serviceName,
		Sampler:     sampler,
		Reporter:    reporter,
	}

	tracer, closer, err := cfg.NewTracer()
	if err != nil {
		return nil, nil, err
	}

	opentracing.SetGlobalTracer(tracer)
	return tracer, closer, nil
}
// 服务中的追踪使用
func (s *LedgerService) CreateTransaction(ctx context.Context, req *CreateTransactionRequest) (*Transaction, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "CreateTransaction")
defer span.Finish()
span.SetTag("ledger_id", req.LedgerID)
span.SetTag("transaction_type", req.Type.String())
// 业务逻辑...
return transaction, nil
}📱 运维脚本
健康检查脚本
bash
#!/bin/bash
# scripts/health_check.sh
#
# Probes the Ledger service's /health, /ready and /metrics endpoints, in that
# order, and exits with status 1 at the first probe that fails.
#
# Environment overrides (the defaults are the previously hard-coded values):
#   SERVICE_URL  base URL of the service          (default: http://localhost:8080)
#   TIMEOUT      per-request time limit, seconds  (default: 10)

SERVICE_URL="${SERVICE_URL:-http://localhost:8080}"
TIMEOUT="${TIMEOUT:-10}"

# check PATH OK_MSG FAIL_MSG
# Requests ${SERVICE_URL}PATH. Prints OK_MSG when curl succeeds; otherwise
# prints FAIL_MSG and terminates the whole script with exit status 1.
check() {
    local path="$1" ok_msg="$2" fail_msg="$3"

    # -f: treat HTTP error responses as failures; -s: no progress output.
    if curl -f -s --max-time "$TIMEOUT" "${SERVICE_URL}${path}" > /dev/null; then
        echo "$ok_msg"
    else
        echo "$fail_msg"
        exit 1
    fi
}

echo "Checking Ledger service health..."

# Basic health status
check "/health" "✅ Health check passed" "❌ Health check failed"

# Readiness status
check "/ready" "✅ Readiness check passed" "❌ Readiness check failed"

# Metrics endpoint
check "/metrics" "✅ Metrics endpoint accessible" "❌ Metrics endpoint failed"

echo "All checks passed!"
性能基准测试
bash
#!/bin/bash
# scripts/performance_test.sh
#
# Runs the Go benchmarks, an ApacheBench load test against the ledgers API,
# and the database benchmarks of the ledger package.

# Stop at the first failing step instead of reporting completion regardless.
set -euo pipefail

echo "Running performance benchmarks..."

# Go benchmarks
echo "1. Running Go benchmarks..."
go test -bench=. -benchmem ./...

# API performance test: 1000 requests, 10 concurrent
echo "2. Running API performance tests..."
ab -n 1000 -c 10 http://localhost:8080/api/v1/ledgers

# Database performance test.
# Benchmarks are selected with -bench; -run only selects tests and examples,
# so -run=BenchmarkDatabase would execute no benchmark at all. -run='^$'
# skips the regular tests so that only the benchmarks run.
echo "3. Running database performance tests..."
go test -tags=integration -run='^$' -bench=BenchmarkDatabase -benchmem ./internal/app/ledger/

echo "Performance tests completed!"
🔧 故障排除
常见问题诊断
1. 性能问题排查
bash
# Show CPU and memory usage of the ledger pods
kubectl top pods -l app=ledger
# Show detailed information for one pod
kubectl describe pod <pod-name>
# Analyse slow queries by following the slow query log inside the pod.
# tail is passed directly to kubectl exec: when it sits on its own line after
# an interactive "/bin/sh", pasting the snippet runs it on the local machine
# instead of in the pod.
# NOTE(review): the path suggests this is the MySQL server's log, so
# <pod-name> presumably has to be the pod running MySQL - confirm.
kubectl exec -it <pod-name> -- tail -f /var/log/mysql/slow.log
2. 连接问题排查
bash
# Check network connectivity from the pod to MySQL on port 3306
kubectl exec --stdin --tty <pod-name> -- nc -zv mysql-service 3306

# Inspect the endpoints registered for the ledger service
kubectl get endpoints ledger-service

# Check DNS resolution of the MySQL service name from inside the pod
kubectl exec --stdin --tty <pod-name> -- nslookup mysql-service
3. 日志分析
bash
# Show error logs.
# --tail=-1 is required: with a label selector kubectl prints only the last
# 10 lines per pod by default. The match is case-insensitive because the JSON
# logger writes the level in lower case.
kubectl logs -l app=ledger --tail=-1 | grep -i error
# Show logs from the last hour (again without the 10-line selector limit)
kubectl logs --since=1h --tail=-1 -l app=ledger
# Show logs of the previous container instance, e.g. after a crash or restart
kubectl logs <pod-name> --previous
📚 相关文档
💡 监控建议：建立完善的监控体系是保证服务稳定运行的关键。要定期回顾和优化监控指标，确保能够及时发现和解决问题。