OpenClaw A/B Testing：Agent实验框架

发布时间：2026-05-10 | 阅读时长：12分钟

"换了新模型效果好像好了一点？"——这句话很危险。感觉不能替代数据。

OpenClaw A/B Testing框架让Agent优化变成科学实验：控制组vs实验组，随机分配，指标追踪，显著性检验。不是"好像好了一点"，而是"提升12.3%，p<0.05，置信区间[8.1%, 16.5%]"。

为什么Agent需要A/B测试

模型切换：Claude vs GPT，哪个更适合你的场景？
提示词优化：新提示词真的比旧的好吗？
技能添加：加了这个Skill效果有提升吗？
参数调整：温度0.7 vs 0.9，哪个更稳定？
Persona变化：用户更喜欢哪种风格？

实验框架架构

import { ABTest } from 'openclaw/experiment';

const experiment = new ABTest({
  name: "prompt-optimization-v1",

  // --- Experiment design ---
  design: {
    hypothesis: "新提示词比旧提示词用户满意度更高",

    // Two arms on the same model; only the system prompt differs.
    groups: [
      {
        id: "control",
        name: "旧提示词",
        proportion: 0.5, // 50% of users
        config: { model: "claude-3-sonnet", systemPrompt: "旧版本提示词..." },
      },
      {
        id: "treatment",
        name: "新提示词",
        proportion: 0.5, // 50% of users
        config: { model: "claude-3-sonnet", systemPrompt: "新版本提示词..." },
      },
    ],

    // Random assignment keyed on userId; `sticky` keeps a user in the same group.
    assignmentStrategy: { method: "random", sticky: true, userIdKey: "userId" },
  },

  // --- Metric definitions ---
  metrics: {
    // Primary metric: 1-5 star rating, expected to go up.
    primary: { name: "用户满意度", type: "rating", goal: "increase" },

    secondary: [
      { name: "响应时间", type: "duration", goal: "decrease" },
      { name: "Token消耗", type: "count", goal: "decrease" },
      { name: "任务成功率", type: "rate", goal: "increase" },
    ],
  },

  // --- Run parameters ---
  parameters: {
    duration: "14d",         // run for 14 days
    minSampleSize: 100,      // at least 100 samples per group
    significanceLevel: 0.05, // significant when p < 0.05

    // Early-stop triggers: significance reached, sample size reached,
    // or the treatment arm is clearly worse.
    earlyStop: {
      enabled: true,
      conditions: ["significance_reached", "sample_size_reached", "treatment_worse"],
    },
  },
});

分组与随机化

// Group assignment helpers for the experiment.
const assignment = {

  /**
   * Deterministically buckets a user into "control" or "treatment" by hashing
   * the user id together with the experiment salt (50/50 split).
   *
   * @param userId - Identifier the hash is computed from.
   * @returns "control" for buckets 0-49, "treatment" for buckets 50-99.
   */
  assignGroup: (userId: string): "control" | "treatment" => {
    // `>>> 0` coerces the checksum to an unsigned 32-bit integer. If crc32
    // returns a signed value, a negative remainder would always be < 50 and
    // every such user would land in the control group.
    const bucket = (crc32(userId + experimentSalt) >>> 0) % 100;

    return bucket < 50 ? "control" : "treatment";
  },

  /**
   * Resolves the group a user belongs to: reuses the stored sticky assignment
   * when one exists, otherwise computes the hash-based group and stores it.
   *
   * @param userId - Identifier of the user to resolve.
   * @returns The stored group, or the newly assigned one.
   */
  getGroup: async (userId: string): Promise<string> => {
    const stored: unknown = await assignment.stickyAssignment.get(userId);
    if (typeof stored === "string" && stored !== "") {
      return stored;
    }

    const group = assignment.assignGroup(userId);
    await assignment.stickyAssignment.set(userId, group);
    return group;
  },

  // Sticky assignment: the same user always stays in the same group.
  stickyAssignment: {
    storage: "memory", // or "redis"
    // NOTE(review): ttl is declared here but is not passed to memory.set below;
    // confirm the store applies it.
    ttl: "30d",

    // Reads the stored group for this experiment/user pair.
    get: async (userId: string) => {
      return await memory.get(`abtest:${experimentId}:${userId}`);
    },

    // Stores the group for this experiment/user pair.
    set: async (userId: string, group: string) => {
      await memory.set(`abtest:${experimentId}:${userId}`, group);
    }
  },

  // Stratified randomization, meant to keep user characteristics balanced
  // across groups.
  stratified: {
    strata: ["user_type", "region"],

    // Builds the stratum key from the user's profile and delegates the
    // within-stratum assignment to stratifiedRandomAssign.
    assign: async (userId: string) => {
      const profile = await getUserProfile(userId);
      const stratum = `${profile.user_type}:${profile.region}`;

      return stratifiedRandomAssign(stratum, userId);
    }
  }
};

指标追踪

// Metric collection for the experiment.
const metricsCollector = {

  // Flags describing which metrics are tracked automatically.
  autoTrack: {
    responseTime: true,     // record response time automatically
    tokenUsage: true,       // record token usage automatically
    successRate: true,      // record task success/failure automatically
    userFeedback: false     // requires the user to submit a rating
  },

  /**
   * Records one metric observation, tagged with the experiment id, the user's
   * group (looked up through assignment.getGroup) and the current time.
   *
   * @param userId - User the observation belongs to.
   * @param metric - Metric name.
   * @param value - Observed value.
   */
  manualTrack: async (userId: string, metric: string, value: number): Promise<void> => {
    const group = await assignment.getGroup(userId);

    await metrics.record({
      experiment: experimentId,
      userId,
      group,
      metric,
      value,
      timestamp: new Date()
    });
  },

  /**
   * Records a user's star rating under the "satisfaction" metric.
   *
   * @param userId - User who submitted the rating.
   * @param rating - Star rating; must be a finite number from 1 to 5.
   * @throws RangeError (as a rejected promise) when the rating is outside 1-5
   *   or not a finite number, so invalid values never reach the recorded data.
   */
  trackRating: async (userId: string, rating: number): Promise<void> => {
    if (!Number.isFinite(rating) || rating < 1 || rating > 5) {
      throw new RangeError(`rating must be a number between 1 and 5, got ${rating}`);
    }

    await metricsCollector.manualTrack(userId, "satisfaction", rating);
  }
};

统计分析

// Experiment analysis.
const analysis = {

  /**
   * Triggers experiment.analyze() and returns a results summary.
   *
   * NOTE: the figures returned below are hard-coded sample output; the value
   * resolved by experiment.analyze() is not used to build them.
   */
  getResults: async () => {
    await experiment.analyze();

    return {
      // Per-group metrics
      groups: {
        control: {
          sampleSize: 152,
          satisfaction: { mean: 3.8, std: 0.92 },
          responseTime: { mean: "2.3s", median: "2.1s" },
          tokenUsage: { mean: 1250, median: 1100 }
        },
        treatment: {
          sampleSize: 148,
          satisfaction: { mean: 4.2, std: 0.88 },
          responseTime: { mean: "2.5s", median: "2.3s" },
          tokenUsage: { mean: 1350, median: 1200 }
        }
      },

      // Statistical tests
      statistical: {
        // Primary metric test (satisfaction)
        primary: {
          test: "t-test",
          pValue: 0.003,
          significant: true,
          confidenceInterval: [0.28, 0.52],
          effectSize: 0.43 // Cohen's d
        },

        // Secondary metric tests
        secondary: {
          responseTime: { pValue: 0.15, significant: false },
          tokenUsage: { pValue: 0.08, significant: false }
        }
      },

      // Conclusion
      conclusion: {
        winner: "treatment",
        recommendation: "采用新提示词，满意度提升约0.4星（10.5%）",
        confidence: "95%"
      }
    };
  },

  /**
   * Runs the significance test that matches the metric's data type on the
   * control and treatment samples.
   *
   * @param metric - "satisfaction", "responseTime" or "tokenUsage" (continuous,
   *   t-test) or "successRate" (rate, chi-square test).
   * @returns Whatever the selected test function returns.
   * @throws Error (as a rejected promise) for any other metric name, instead
   *   of silently resolving to undefined.
   */
  significanceTest: async (metric: string) => {
    const continuousMetrics = ["satisfaction", "responseTime", "tokenUsage"];
    const isContinuous = continuousMetrics.includes(metric);

    // Reject unknown metrics before fetching any data.
    if (!isContinuous && metric !== "successRate") {
      throw new Error(`Unsupported metric for significance testing: ${metric}`);
    }

    const controlData = await metrics.getGroupData("control", metric);
    const treatmentData = await metrics.getGroupData("treatment", metric);

    // Continuous variables use the t-test; rates use the chi-square test.
    return isContinuous
      ? tTest(controlData, treatmentData)
      : chiSquareTest(controlData, treatmentData);
  }
};

实验管理

// Experiment lifecycle.
const lifecycle = {

  /** Starts the experiment, sends a notification and begins automatic tracking. */
  start: async () => {
    await experiment.start();

    // Notify stakeholders.
    await notify("实验 prompt-optimization-v1 已启动");

    // NOTE(review): startAutoTrack is not among the metricsCollector members
    // shown in this file; confirm it is defined.
    metricsCollector.startAutoTrack();
  },

  /**
   * Checks the experiment status once and stops the experiment when either
   * the early-stop condition (>= 100 samples and significance reached) or the
   * 14-day limit is met. Stops at most once per call.
   *
   * Meant to be invoked periodically (the original note says hourly); this
   * function does no scheduling itself.
   *
   * NOTE(review): this calls experiment.stop() directly, so the final report
   * and winner rollout in lifecycle.stop are not run from here; confirm that
   * is intended.
   */
  monitor: async () => {
    const status = await experiment.status();

    if (status.sampleSize >= 100 && status.significanceReached) {
      console.log("✅ 已达到显著性，可以提前结束");
      await experiment.stop();
      // Return so the duration check below cannot stop the experiment again.
      return;
    }

    if (status.daysElapsed >= 14) {
      console.log("📅 实验周期结束");
      await experiment.stop();
    }
  },

  /** Stops the experiment, builds the final report and applies the treatment config if it won. */
  stop: async () => {
    await experiment.stop();

    // Generate the final report.
    const report = await experiment.finalReport();

    // Apply the winning configuration.
    if (report.winner === "treatment") {
      await applyConfig(report.winningConfig);
      console.log("✅ 已应用新提示词");
    }
  },

  /** Rolls the experiment back, e.g. when the treatment group performs poorly. */
  rollback: async () => {
    await experiment.rollback();
    console.log("⚠️ 实验已回滚，继续使用控制组配置");
  }
};

实战案例：提示词优化实验

// Case study: compares the current persona against a more humorous one.
const promptExperiment = new ABTest({
  name: "妙趣风格优化",

  design: {
    // Two equally sized arms that differ in persona tone and humor level.
    groups: [
      {
        id: "control",
        name: "当前风格",
        proportion: 0.5,
        config: { persona: { tone: "friendly_professional", humorLevel: 0.6 } },
      },
      {
        id: "treatment",
        name: "增强幽默",
        proportion: 0.5,
        config: { persona: { tone: "humorous", humorLevel: 0.8 } },
      },
    ],
  },

  metrics: {
    primary: { name: "用户互动率", type: "rate", goal: "increase" },
    secondary: [
      { name: "转发率", type: "rate", goal: "increase" },
      { name: "投诉率", type: "rate", goal: "decrease" },
    ],
  },

  parameters: {
    // Two weeks, matching the "results after two weeks" note below; was "7d".
    duration: "14d",
    minSampleSize: 500,
  },
});

// Results after running for two weeks
const result = await promptExperiment.analyze();
// Outcome: the humorous style raised the engagement rate by 15%; the complaint rate showed no notable change
// Decision: adopt the enhanced-humor style

最佳实践

明确假设：实验前明确预期结果和成功标准
单一变量：每次只测试一个变化
足够样本：每组至少100样本；具体数量应通过功效分析确定，预期效果越小，所需样本越多
随机分配：确保组间无系统性差异
持续监控：定期检查，及时止损
记录决策：无论结果好坏，记录决策依据

常见问题

Q: 样本量如何确定？

使用功效分析（Power Analysis）计算。一般规则：期望效果越小，需要样本越大。OpenClaw内置样本量计算器。

Q: 如何避免新奇效应（Novelty Effect）？

新功能初期可能表现异常好（新奇效应）。建议延长实验周期（至少2周），观察长期表现。

科学验证Agent优化

"感觉好像好了一点"是危险的。OpenClaw A/B Testing框架让Agent优化变成科学实验：清晰的假设、严格的分组、可信的统计。用数据说话，而不是用直觉。

查看实验模板