可观测性——让AI不再"黑盒",让运维不再"猜谜"
凌晨2点43分,我的Agent突然开始疯狂重试。日志里只有一行"Error: something went wrong",然后就是一堆堆栈信息。花了3小时才定位到是一个API限流问题。
Observability(可观测性)区别于传统监控——它不只是告诉你"系统挂了",而是帮你理解"为什么挂了"。OpenClaw提供三大支柱:Logs(日志)、Metrics(指标)、Traces(追踪),让Agent的每一次决策都透明可见。
# SOUL.md logging configuration
logging:
  format: json  # structured output
  level: info
  # Field names listed for the log output.
  fields:
    - agent_id
    - session_id
    - skill_name
    - tool_name
    - duration_ms
    - tokens_used
# Example output
{
  "timestamp": "2026-05-07T01:43:22.123Z",
  "level": "error",
  "agent_id": "miaoquai_ops",
  "session_id": "sess_abc123",
  "skill": "web_fetch",
  "message": "Rate limit exceeded",
  "retry_count": 3,
  "backoff_ms": 5000
}
# Smart log levels: per-level switches
logging:
  levels:
    - name: debug
      enabled: false  # turned off in production
    - name: info
      enabled: true
      sampling: 0.1  # 10% sampling rate
    - name: warn
      enabled: true
    - name: error
      enabled: true
      alert: true  # error logs trigger an alert
# OpenClaw built-in metrics
metrics:
  # Execution metrics
  - name: agent.execution.count
    type: counter
    labels: [agent_id, skill_name, status]
  - name: agent.execution.duration
    type: histogram
    labels: [agent_id, skill_name]
    # NOTE(review): bucket bounds are presumably milliseconds — confirm.
    buckets: [100, 500, 1000, 5000, 10000]

  # Token metrics
  - name: agent.tokens.used
    type: counter
    labels: [agent_id, model_name]
  - name: agent.tokens.cost
    type: counter
    labels: [agent_id, model_name]

  # Error metrics
  - name: agent.errors.count
    type: counter
    labels: [agent_id, error_type]
# SKILLS/my_skill.md custom metrics
metrics:
  custom:
    - name: skill_llm_calls
      type: counter
      description: "LLM调用次数"
    - name: skill_cache_hits
      type: counter
      description: "缓存命中次数"
# Tracing configuration
tracing:
  enabled: true
  sampling_rate: 0.1  # trace 10% of requests

  # Trace levels: what is captured at each level
  levels:
    - name: session
      capture: [input, output, duration]
    - name: skill
      capture: [input, output, duration, tokens]
    - name: tool
      capture: [input, output, duration, error]
    - name: llm
      capture: [prompt, completion, tokens, cost]

  # Export configuration
  exporter:
    type: otlp
    endpoint: http://localhost:4317
# 一个请求的完整追踪链
Session (sess_abc123) 5000ms
├── Skill: web_search 800ms
│   ├── Tool: brave_search 750ms
│   └── LLM: query_rewrite 50ms
├── Skill: web_fetch 3000ms
│   ├── Tool: fetch_url 2800ms
│   └── LLM: extract_content 200ms
└── Skill: summarize 1200ms
    └── LLM: gpt4_summary 1200ms
# Alert rule configuration
alerts:
  rules:
    # NOTE(review): the message describes an error *rate* above 5%, but the
    # condition compares the rate of the error counter alone with 0.05;
    # confirm whether it should be a ratio against agent.execution.count.
    - name: high_error_rate
      condition: "rate(agent.errors.count[5m]) > 0.05"
      severity: critical
      message: "Agent错误率超过5%"
    # NOTE(review): 10000 equals the largest bucket bound declared for
    # agent.execution.duration; if quantiles are estimated from bucket
    # bounds, this comparison may never be true — confirm.
    - name: high_latency
      condition: "histogram_quantile(0.99, agent.execution.duration) > 10000"
      severity: warning
      message: "P99延迟超过10秒"
    # NOTE(review): confirm the unit rate() yields over a 1h window (per
    # second vs. per window) so the 100000 threshold means what is intended.
    - name: high_token_usage
      condition: "rate(agent.tokens.used[1h]) > 100000"
      severity: warning
      message: "Token使用量异常"

  # Notification channels
  # NOTE(review): nested under `alerts` by analogy with the tracing block;
  # the flattened original does not show the level — confirm.
  notifications:
    - type: feishu
      webhook: "${FEISHU_WEBHOOK}"  # webhook URL taken from the environment
    - type: email
      recipients: ["admin@miaoquai.com"]
namespace.unit.action 格式