OpenClaw Document Pipeline 文档处理流水线
2026年5月,一篇论文引爆社区:"LLMs Corrupt Your Documents When You Delegate"。当你把文档交给LLM处理时,它可能悄悄篡改数据——数字变了、名字错了、关键信息消失了。
这不是危言耸听。OpenClaw的Document Pipeline模块正是为解决这个问题而生:构建可验证、可追溯、可恢复的文档处理流水线。
为什么需要文档流水线
传统文档处理方式存在三大问题:
⚠️ LLM文档损坏风险
- 数值篡改:LLM可能"修正"它认为错误的数字
- 名称替换:人名、公司名被意外替换
- 格式丢失:表格结构、缩进层级混乱
- 幻觉内容:添加原文不存在的描述
OpenClaw Document Pipeline通过以下机制解决:
- 原文保留:所有处理保留原始文档快照
- 差异追踪:每个修改都有diff记录
- 校验机制:关键数据提取后自动比对
- 回滚能力:任意步骤可回退
核心架构
import { DocumentPipeline } from 'openclaw/document';

// OCR settings applied during ingestion (for scanned documents).
const ocrOptions = {
  enabled: true,
  engine: "tesseract",
  languages: ["chi_sim", "eng"]
};

// Input stage: accepted formats, size cap, and OCR handling.
const ingestOptions = {
  formats: ["pdf", "docx", "html", "txt", "md"],
  maxSize: "50MB",
  ocr: ocrOptions
};

// Parsing stage: keep document structure and pull out tables/images.
const parseOptions = {
  preserveStructure: true,
  extractTables: true,
  extractImages: true,
  detectLanguage: true
};

// Processing stage: chunking for downstream LLM work plus entity extraction.
const processOptions = {
  chunkSize: 1000,
  chunkOverlap: 200,
  extractEntities: true,
  summarize: false
};

// Validation stage: checksums and diff tracking; no human review by default.
const validateOptions = {
  checksumVerification: true,
  diffTracking: true,
  humanReview: false
};

// Output stage: emit JSON and Markdown with metadata and source references.
const outputOptions = {
  formats: ["json", "markdown"],
  includeMetadata: true,
  includeSource: true
};

const pipeline = new DocumentPipeline({
  name: "contract-analyzer",
  ingest: ingestOptions,
  parse: parseOptions,
  process: processOptions,
  validate: validateOptions,
  output: outputOptions
});
实战案例:合同分析流水线
Step 1: 定义处理流程
// Stage 1: document ingestion — load PDFs/DOCX, OCR scanned pages.
const ingestStep = {
  name: "ingest",
  action: "load",
  config: {
    formats: ["pdf", "docx"],
    ocr: { enabled: true }
  }
};

// Stage 2: structural extraction — sections, clauses, tables, signatures.
const structureStep = {
  name: "structure",
  action: "extract",
  config: {
    sections: true,
    clauses: true,
    tables: true,
    signatures: true
  }
};

// Stage 3: LLM-based key-information extraction against a typed schema.
const keyInfoStep = {
  name: "extract-key-info",
  action: "llm-extract",
  config: {
    model: "claude-3-opus",
    schema: {
      parties: "array",
      effectiveDate: "date",
      expirationDate: "date",
      paymentTerms: "string",
      liability: "string",
      terminationClause: "string"
    },
    // Key safeguard: every extracted value is checked against the source text,
    // and anything that cannot be verified is explicitly marked.
    verifyAgainstSource: true,
    markUnverified: true
  }
};

// Stage 4: risk analysis — flag known dangerous clause patterns by severity.
const riskStep = {
  name: "risk-analysis",
  action: "analyze",
  config: {
    patterns: [
      "无限责任条款",
      "自动续约陷阱",
      "模糊赔偿条款",
      "单方面解约权"
    ],
    severity: ["high", "medium", "low"]
  }
};

const contractPipeline = new DocumentPipeline({
  name: "contract-review",
  stages: [ingestStep, structureStep, keyInfoStep, riskStep]
});
Step 2: 执行并追踪
const result = await contractPipeline.run("./contracts/lease-2024.pdf");
// The result carries the complete per-stage trace for this document.
console.log(result);
{
documentId: "doc_abc123",
originalHash: "sha256:8f4a...",
stages: {
ingest: {
status: "success",
duration: "1.2s",
pages: 15
},
structure: {
status: "success",
sections: 8,
tables: 3,
clauses: 42
},
"extract-key-info": {
status: "partial", // 部分数据未验证
extracted: {
parties: ["甲方: 张三", "乙方: 李四"], // ✅ 已验证
effectiveDate: "2024-01-01", // ✅ 已验证
paymentTerms: "月付5000元", // ⚠️ 未验证
liability: "乙方承担全部损失" // ✅ 已验证
},
verification: {
verified: 3,
unverified: 1,
confidence: 0.87
}
},
"risk-analysis": {
status: "success",
risks: [
{ type: "无限责任", severity: "high", location: "第8条第3款" },
{ type: "自动续约", severity: "medium", location: "第12条" }
]
}
}
}
Step 3: 人工审核接口
// 获取需要审核的内容
// Fetch every field that still needs human sign-off for this document.
const reviewItems = await pipeline.getReviewItems(result.documentId);

for (const entry of reviewItems) {
  console.log(`待审核: ${entry.field}`);
  console.log(`提取值: ${entry.extracted}`);
  console.log(`原文位置: ${entry.sourceLocation}`);
  console.log(`置信度: ${entry.confidence}`);

  // Record the human confirmation, accepting the extracted value as-is.
  const confirmation = {
    value: entry.extracted,
    reviewer: "human",
    verified: true
  };
  await pipeline.confirmField(result.documentId, entry.field, confirmation);
}
防止LLM文档损坏
校验机制
// Numeric check: every extracted number must appear verbatim in the source;
// zero tolerance, and a mismatch rejects the extraction outright.
const numericVerification = {
  enabled: true,
  tolerance: 0,
  onMismatch: "reject"
};

// Entity check: names must exist in the source text exactly (no fuzzy match);
// mismatches are flagged for review rather than rejected.
const entityVerification = {
  enabled: true,
  fuzzyMatch: false,
  onMismatch: "flag"
};

// Date check: accept several input formats and normalize to ISO form.
const dateVerification = {
  enabled: true,
  formats: ["YYYY-MM-DD", "YYYY/MM/DD", "MM/DD/YYYY"],
  normalize: "YYYY-MM-DD"
};

// Completeness check: warn when any required field is missing.
const completenessCheck = {
  enabled: true,
  requiredFields: ["parties", "date", "terms"],
  onMissing: "warn"
};

const safePipeline = new DocumentPipeline({
  validation: {
    numericVerification,
    entityVerification,
    dateVerification,
    completenessCheck
  }
});
差异追踪
// 启用变更追踪
const result = await pipeline.run(doc, {
trackChanges: true,
includeDiff: true
});
// 查看所有变更
for (const change of result.changes) {
console.log(`字段: ${change.field}`);
console.log(`原文: ${change.original}`);
console.log(`输出: ${change.extracted}`);
console.log(`类型: ${change.type}`); // addition | modification | deletion
console.log(`位置: ${change.location}`);
}
批量处理
// Progress callback, invoked as batch items complete.
const reportProgress = (progress) => {
  console.log(`处理进度: ${progress.completed}/${progress.total}`);
  console.log(`成功率: ${progress.successRate}`);
};

const batchProcessor = new DocumentPipeline({
  name: "batch-contracts",
  // Batch sizing: up to 100 documents, 5 processed in parallel,
  // and one failure does not abort the whole batch.
  batch: {
    maxSize: 100,
    parallel: 5,
    failFast: false
  },
  // Failure handling: 3 retries with exponential backoff, then skip.
  errorHandling: {
    retry: 3,
    backoff: "exponential",
    onFailure: "skip" // or "stop"
  },
  progress: {
    callback: reportProgress
  }
});

// Documents to process in a single batch run.
const documents = [
  "./docs/contract1.pdf",
  "./docs/contract2.docx",
  "./docs/contract3.pdf"
];
const results = await batchProcessor.runBatch(documents);

// Aggregated report for the whole batch.
console.log(results.summary);
// { total: 3, success: 2, failed: 1, avgDuration: "2.3s" }
最佳实践
流水线设计原则
- 分阶段处理:每个阶段职责单一,便于定位问题
- 幂等性:同一文档多次处理结果一致
- 可恢复:失败后从断点继续,不重新开始
- 可观测:每个阶段都有清晰的输入输出日志
数据安全建议
- 处理前备份原文档
- 敏感文档启用加密存储
- 设置文档访问权限
- 定期清理临时文件
性能优化
- 大文档分块并行处理
- 缓存解析结果避免重复
- OCR任务使用队列管理
- 选择性提取:只提取需要的字段
常见问题
Q: 如何处理扫描PDF?
启用OCR模块,支持Tesseract和商业OCR引擎。建议先测试OCR质量,设置合适的DPI(推荐300)。
Q: 表格提取不准确怎么办?
使用OpenClaw的表格结构化提取功能,支持复杂表格、合并单元格。对于特别复杂的表格,可启用视觉模型辅助。
Q: 如何验证LLM没有篡改数据?
启用verifyAgainstSource选项,系统会自动比对提取值与原文。所有差异都会标记并记录位置。