第2229篇:文档比对系统的工程实现——合同版本差异的AI识别
2026/4/30大约 9 分钟
第2229篇:文档比对系统的工程实现——合同版本差异的AI识别
适读人群:做法律科技、合规系统、文档管理的工程师 | 阅读时长:约16分钟 | 核心价值:构建智能合同版本比对系统,精准识别条款变化、风险点和关键差异
法务部门有个永恒的痛点:合同版本管理。
甲方发来一份修改版合同,标注了"仅做少量调整"。但拿到PDF后,用Word的"修订比较"功能一看,几十处改动。有的改了数字,有的改了措辞,有的加了几个字。
每一处改动都需要仔细阅读理解,判断对己方的影响。这工作量,在复杂合同上能花掉资深律师半天时间。
我们给一家律所做了合同AI比对系统,核心价值不只是找出哪里不同(这Word本身就能做),而是理解差异的法律含义,标记风险等级,生成可读的差异摘要。
系统架构
文档解析与结构化
/**
* 合同文档解析器
* 将合同PDF解析为结构化的条款树
*/
@Service
@Slf4j
public class ContractDocumentParser {
@Autowired
private OpenAiClient openAiClient;
/**
 * Parses a contract PDF into a structured clause tree.
 *
 * <p>The PDF text is extracted, its first 8000 characters are sent to the LLM together
 * with a prompt asking for a JSON clause tree, and the JSON reply is mapped onto a
 * {@code ContractStructure}. If the reply cannot be parsed, the method degrades to
 * {@code ContractStructure.ofRawText}. Failures of text extraction or of the LLM call
 * itself happen outside the try block and propagate to the caller.
 *
 * @param pdfBytes raw bytes of the contract PDF
 * @param fileName file name recorded on the resulting structure
 * @return the structured contract, or the raw-text fallback when the LLM reply is unusable
 */
public ContractStructure parseContract(byte[] pdfBytes, String fileName) {
    // 1. Extract the plain text.
    String rawText = extractText(pdfBytes);

    // Only a prefix of the text is sent to the model; make the truncation visible
    // because clauses beyond the limit will be absent from the structured result.
    int promptChars = Math.min(rawText.length(), 8000);
    if (promptChars < rawText.length()) {
        log.warn("合同文本过长,结构识别仅使用前{}个字符: fileName={}, totalChars={}",
                promptChars, fileName, rawText.length());
    }

    // 2. Ask the LLM to identify the clause hierarchy.
    String structurePrompt = """
            请分析以下合同文本,识别其层级结构。
            输出JSON格式的条款树:
            {
            "contractTitle": "合同名称",
            "parties": ["甲方", "乙方"],
            "clauses": [
            {
            "id": "1",
            "title": "章节标题",
            "level": 1,
            "content": "正文内容(如果是叶节点)",
            "subClauses": [
            {
            "id": "1.1",
            "title": "条款标题",
            "level": 2,
            "content": "条款内容"
            }
            ]
            }
            ]
            }
            合同文本:
            """ + rawText.substring(0, promptChars);
    String response = openAiClient.chat(structurePrompt,
            ChatOptions.builder().temperature(0.0).maxTokens(4000).build());

    try {
        // Strip Markdown code fences the model may wrap around the JSON.
        String cleaned = response.replaceAll("```json\\s*", "").replaceAll("```\\s*", "").trim();
        JsonNode node = new ObjectMapper().readTree(cleaned);

        ContractStructure structure = new ContractStructure();
        structure.setFileName(fileName);
        // path() instead of get(): a reply without "contractTitle" yields an empty title
        // rather than an NPE that would discard an otherwise valid clause tree.
        structure.setContractTitle(node.path("contractTitle").asText(""));
        structure.setClauses(parseClauses(node.get("clauses")));
        structure.setRawText(rawText);
        log.info("合同解析完成: title={}, clauses={}",
                structure.getContractTitle(), structure.getClauses().size());
        return structure;
    } catch (Exception e) {
        log.error("合同结构解析失败", e);
        // Degrade: return the unstructured raw-text representation.
        return ContractStructure.ofRawText(rawText, fileName);
    }
}
/**
 * Extracts the text of a PDF held in memory.
 *
 * @param pdfBytes raw PDF bytes
 * @return the text produced by {@code PDFTextStripper} for the loaded document
 * @throws DocumentParseException wrapping any {@code IOException} raised while loading or stripping
 */
private String extractText(byte[] pdfBytes) {
    try (PDDocument document = PDDocument.load(pdfBytes)) {
        return new PDFTextStripper().getText(document);
    } catch (IOException ioe) {
        throw new DocumentParseException("PDF文本提取失败", ioe);
    }
}
/**
 * Recursively converts a JSON array of clause objects into {@code ContractClause}s.
 *
 * <p>{@code title} defaults to the empty string and {@code level} to 1 when the field is
 * absent; {@code content} is {@code null} when absent; {@code subClauses} defaults to an
 * empty list. {@code id} is required.
 *
 * @param clausesNode JSON node expected to be an array; {@code null} or a non-array yields an empty list
 * @return the parsed clauses in document order (mutable list)
 * @throws IllegalArgumentException if a clause object has no {@code id}
 */
private List<ContractClause> parseClauses(JsonNode clausesNode) {
    List<ContractClause> clauses = new ArrayList<>();
    if (clausesNode == null || !clausesNode.isArray()) {
        return clauses;
    }
    for (JsonNode clauseNode : clausesNode) {
        JsonNode idNode = clauseNode.get("id");
        if (idNode == null || idNode.isNull()) {
            // Fail with a descriptive error instead of a bare NPE; clauses are matched
            // across versions by id, so a clause without one cannot be compared.
            throw new IllegalArgumentException("clause is missing required field 'id': " + clauseNode);
        }
        ContractClause clause = ContractClause.builder()
                .id(idNode.asText())
                // path() returns a missing-node placeholder instead of null, so the
                // defaults below actually apply when the model omits these fields.
                .title(clauseNode.path("title").asText(""))
                .level(clauseNode.path("level").asInt(1))
                .content(clauseNode.has("content") ? clauseNode.get("content").asText(null) : null)
                .subClauses(clauseNode.has("subClauses")
                        ? parseClauses(clauseNode.get("subClauses"))
                        : Collections.emptyList())
                .build();
        clauses.add(clause);
    }
    return clauses;
}
}
差异识别:从字符到语义
/**
* 合同差异分析服务
* 两层分析:文本差异 + 语义差异
*/
@Service
@Slf4j
public class ContractDiffAnalyzer {
@Autowired
private OpenAiClient openAiClient;
/**
 * Produces a full diff report for two versions of a contract.
 *
 * <p>Clauses of both versions are flattened into id-keyed maps and compared by id:
 * ids present only in B are reported as added, ids present only in A as deleted, and
 * ids whose content differs are analysed with {@code analyzeClauseChange}. The diffs are
 * sorted by risk level (reverse natural enum order) and then by clause id.
 *
 * @param versionA the original version
 * @param versionB the revised version
 * @return the report with per-clause diffs, a textual summary and risk counts
 */
public ContractDiffReport analyze(ContractStructure versionA,
                                  ContractStructure versionB) {
    log.info("开始合同差异分析: A={}, B={}",
            versionA.getFileName(), versionB.getFileName());

    // 1. Match clauses by id.
    Map<String, ContractClause> clausesA = flattenClauses(versionA.getClauses());
    Map<String, ContractClause> clausesB = flattenClauses(versionB.getClauses());
    Set<String> allClauseIds = new HashSet<>(clausesA.keySet());
    allClauseIds.addAll(clausesB.keySet());

    List<ClauseDiff> clauseDiffs = new ArrayList<>();
    for (String clauseId : allClauseIds) {
        ContractClause clauseA = clausesA.get(clauseId);
        ContractClause clauseB = clausesB.get(clauseId);
        if (clauseA == null) {
            // Clause exists only in version B: added.
            clauseDiffs.add(ClauseDiff.added(clauseId, clauseB));
        } else if (clauseB == null) {
            // Clause exists only in version A: deleted.
            clauseDiffs.add(ClauseDiff.deleted(clauseId, clauseA));
        } else if (!clauseA.getContent().equals(clauseB.getContent())) {
            // Same id, different content: run the semantic analysis.
            clauseDiffs.add(analyzeClauseChange(clauseA, clauseB));
        }
    }

    // 2. Sort by risk level, then by clause id. Ids are compared segment by segment
    //    so that "1.2" sorts before "1.10" (plain String order would reverse them).
    // NOTE(review): assumes every ClauseDiff, including added/deleted ones, carries a
    // non-null risk level; a null would make this comparator throw — confirm.
    clauseDiffs.sort(Comparator
            .comparing(ClauseDiff::getRiskLevel, Comparator.reverseOrder())
            .thenComparing(ClauseDiff::getClauseId, ContractDiffAnalyzer::compareClauseIds));

    // 3. Build the overall summary and the report.
    String summary = generateDiffSummary(clauseDiffs, versionA, versionB);
    ContractDiffReport report = ContractDiffReport.builder()
            .versionA(versionA.getFileName())
            .versionB(versionB.getFileName())
            .analysisTime(Instant.now())
            .clauseDiffs(clauseDiffs)
            .summary(summary)
            .highRiskCount(countByRisk(clauseDiffs, RiskLevel.HIGH))
            .mediumRiskCount(countByRisk(clauseDiffs, RiskLevel.MEDIUM))
            .lowRiskCount(countByRisk(clauseDiffs, RiskLevel.LOW))
            .build();
    log.info("合同差异分析完成: 高风险={}, 中风险={}, 低风险={}",
            report.getHighRiskCount(), report.getMediumRiskCount(), report.getLowRiskCount());
    return report;
}

/**
 * Orders dotted clause ids such as "1.2" and "1.10" segment by segment.
 * A shorter id sorts before a longer id that it prefixes; full ties fall back to
 * plain string order so the result is a total order.
 */
private static int compareClauseIds(String idA, String idB) {
    String[] partsA = idA.split("\\.");
    String[] partsB = idB.split("\\.");
    int shared = Math.min(partsA.length, partsB.length);
    for (int k = 0; k < shared; k++) {
        int cmp = compareIdSegment(partsA[k], partsB[k]);
        if (cmp != 0) {
            return cmp;
        }
    }
    if (partsA.length != partsB.length) {
        return Integer.compare(partsA.length, partsB.length);
    }
    return idA.compareTo(idB);
}

/**
 * Compares one id segment: numeric segments compare by value and sort before
 * non-numeric ones, which compare as strings. Keeping the two classes apart keeps
 * the ordering transitive.
 */
private static int compareIdSegment(String segA, String segB) {
    // At most 9 digits so Integer.parseInt cannot overflow.
    boolean numericA = segA.matches("\\d{1,9}");
    boolean numericB = segB.matches("\\d{1,9}");
    if (numericA && numericB) {
        return Integer.compare(Integer.parseInt(segA), Integer.parseInt(segB));
    }
    if (numericA != numericB) {
        return numericA ? -1 : 1;
    }
    return segA.compareTo(segB);
}
/**
 * Analyses how a single clause changed between two versions.
 *
 * <p>Computes the textual diff, then asks the LLM for a JSON assessment (change type,
 * summary, legal implication, risk level, reason, recommendation). If the reply cannot
 * be parsed or lacks a required field, a fallback diff with {@code RiskLevel.UNKNOWN}
 * is returned. An exception thrown by the LLM call itself propagates.
 *
 * @param clauseA the clause as it appears in the original version
 * @param clauseB the clause as it appears in the revised version
 * @return a {@code MODIFIED} diff, with or without the LLM assessment
 */
private ClauseDiff analyzeClauseChange(ContractClause clauseA, ContractClause clauseB) {
    // 1. Textual diff, used to show where the wording changed.
    List<TextChange> textChanges = computeTextDiff(clauseA.getContent(), clauseB.getContent());

    // 2. Semantic assessment by the LLM.
    String analysisPrompt = String.format("""
            请分析以下合同条款的修改,从乙方(接受方)角度评估风险。
            【修改前】
            %s
            【修改后】
            %s
            请输出JSON:
            {
            "changeType": "变化类型(数字变更/权利义务变化/期限变化/措辞调整/条款删除/条款新增)",
            "changeSummary": "用一句话描述变化内容",
            "legalImplication": "对乙方的法律意义",
            "riskLevel": "HIGH/MEDIUM/LOW/NONE",
            "riskReason": "风险原因(如果有)",
            "recommendation": "建议处理方式"
            }
            """,
            clauseA.getContent(),
            clauseB.getContent());
    String analysis = openAiClient.chat(analysisPrompt,
            ChatOptions.builder().temperature(0.1).maxTokens(500).build());

    try {
        // Strip Markdown code fences the model may wrap around the JSON.
        String cleaned = analysis.replaceAll("```json\\s*", "").replaceAll("```\\s*", "").trim();
        JsonNode node = new ObjectMapper().readTree(cleaned);

        // Normalise case and whitespace before the enum lookup so replies such as
        // "high" or " HIGH " are accepted. Locale.ROOT avoids locale-specific case rules.
        // A value that still matches no RiskLevel constant falls through to the catch below.
        String riskText = node.get("riskLevel").asText("LOW").trim()
                .toUpperCase(java.util.Locale.ROOT);

        return ClauseDiff.builder()
                .clauseId(clauseA.getId())
                .clauseTitle(clauseA.getTitle())
                .diffType(DiffType.MODIFIED)
                .contentBefore(clauseA.getContent())
                .contentAfter(clauseB.getContent())
                .textChanges(textChanges)
                .changeType(node.get("changeType").asText())
                .changeSummary(node.get("changeSummary").asText())
                .legalImplication(node.get("legalImplication").asText())
                .riskLevel(RiskLevel.valueOf(riskText))
                .riskReason(node.has("riskReason") ? node.get("riskReason").asText(null) : null)
                .recommendation(node.get("recommendation").asText())
                .build();
    } catch (Exception e) {
        log.error("条款变化分析失败: clauseId={}", clauseA.getId(), e);
        // Fallback keeps the raw before/after text and marks the risk as unknown.
        // The clause title is included so the entry is still identifiable in reports.
        return ClauseDiff.builder()
                .clauseId(clauseA.getId())
                .clauseTitle(clauseA.getTitle())
                .diffType(DiffType.MODIFIED)
                .contentBefore(clauseA.getContent())
                .contentAfter(clauseB.getContent())
                .textChanges(textChanges)
                .riskLevel(RiskLevel.UNKNOWN)
                .build();
    }
}
/**
 * Computes a segment-level diff between two texts using a longest-common-subsequence table.
 *
 * <p>Both texts are split after each "。" and "," and on whitespace runs; the resulting
 * segments, not individual characters, are the unit of comparison. Segments that are
 * not part of the common subsequence are reported as added (present only in
 * {@code textB}) or deleted (present only in {@code textA}), each with its segment index.
 * For production use the original author recommends the java-diff-utils library.
 *
 * @param textA the original text
 * @param textB the revised text
 * @return the added/deleted segments in document order
 */
private List<TextChange> computeTextDiff(String textA, String textB) {
    // NOTE(review): the original pattern listed the "(?<=。)" alternative twice; one of
    // them was presumably meant to be another delimiter (e.g. a semicolon) — confirm.
    String segmentBoundary = "(?<=。)|(?<=,)|\\s+";
    String[] wordsA = textA.split(segmentBoundary);
    String[] wordsB = textB.split(segmentBoundary);
    int[][] lcs = computeLCS(wordsA, wordsB);

    // Walk the LCS table backwards from the bottom-right corner to recover the edits.
    int i = wordsA.length;
    int j = wordsB.length;
    List<TextChange> reversedChanges = new ArrayList<>();
    while (i > 0 || j > 0) {
        if (i > 0 && j > 0 && wordsA[i - 1].equals(wordsB[j - 1])) {
            // Common segment: not a change.
            i--;
            j--;
        } else if (j > 0 && (i == 0 || lcs[i][j - 1] >= lcs[i - 1][j])) {
            reversedChanges.add(TextChange.added(wordsB[j - 1], j - 1));
            j--;
        } else {
            reversedChanges.add(TextChange.deleted(wordsA[i - 1], i - 1));
            i--;
        }
    }
    // Edits were collected back-to-front; restore document order.
    Collections.reverse(reversedChanges);
    return reversedChanges;
}
/**
 * Builds the longest-common-subsequence length table for two segment arrays.
 *
 * @param a first sequence
 * @param b second sequence
 * @return a table of size {@code (a.length + 1) x (b.length + 1)} where entry
 *         {@code [i][j]} is the LCS length of the first {@code i} elements of {@code a}
 *         and the first {@code j} elements of {@code b}
 */
private int[][] computeLCS(String[] a, String[] b) {
    int rows = a.length;
    int cols = b.length;
    int[][] table = new int[rows + 1][cols + 1];
    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
            table[r + 1][c + 1] = a[r].equals(b[c])
                    ? table[r][c] + 1
                    : Math.max(table[r][c + 1], table[r + 1][c]);
        }
    }
    return table;
}
/**
 * Builds the human-readable summary shown at the top of the report.
 *
 * <p>Returns a fixed "no differences" sentence for an empty diff list; otherwise a line
 * with the total and per-risk counts, followed by the change summaries of at most three
 * high-risk diffs. The two version parameters are not read by this method.
 *
 * @param diffs    all clause diffs
 * @param versionA original version (unused)
 * @param versionB revised version (unused)
 * @return the summary text
 */
private String generateDiffSummary(List<ClauseDiff> diffs,
                                   ContractStructure versionA,
                                   ContractStructure versionB) {
    if (diffs.isEmpty()) {
        return "两版本合同内容完全相同,无任何差异。";
    }

    long highRisk = countByRisk(diffs, RiskLevel.HIGH);
    long mediumRisk = countByRisk(diffs, RiskLevel.MEDIUM);
    long lowRisk = countByRisk(diffs, RiskLevel.LOW);

    StringBuilder summary = new StringBuilder(
            String.format("共发现 %d 处差异(高风险%d处,中风险%d处,低风险%d处)。\n\n",
                    diffs.size(), highRisk, mediumRisk, lowRisk));

    if (highRisk > 0) {
        summary.append("⚠️ 高风险变化:\n");
        int listed = 0;
        for (ClauseDiff diff : diffs) {
            if (listed == 3) {
                break;
            }
            if (diff.getRiskLevel() != RiskLevel.HIGH) {
                continue;
            }
            summary.append("- ").append(diff.getChangeSummary()).append("\n");
            listed++;
        }
    }
    return summary.toString();
}
/**
 * Flattens a clause tree into a map keyed by clause id.
 *
 * <p>Only clauses with non-empty content are included; the tree is walked depth-first
 * and the map preserves that encounter order. If two clauses share an id, the later
 * one replaces the earlier one.
 *
 * @param clauses top-level clauses; {@code null} is treated as an empty list
 * @return id-to-clause map of all content-bearing clauses
 */
private Map<String, ContractClause> flattenClauses(List<ContractClause> clauses) {
    Map<String, ContractClause> flat = new LinkedHashMap<>();
    flattenHelper(clauses, flat);
    return flat;
}

/**
 * Recursive worker for {@code flattenClauses}: adds content-bearing clauses of
 * {@code clauses} and of all their descendants to {@code result}.
 */
private void flattenHelper(List<ContractClause> clauses, Map<String, ContractClause> result) {
    // A structure without a clause list (e.g. a raw-text fallback) must not crash the diff.
    if (clauses == null) {
        return;
    }
    for (ContractClause clause : clauses) {
        if (clause.getContent() != null && !clause.getContent().isEmpty()) {
            result.put(clause.getId(), clause);
        }
        if (clause.getSubClauses() != null) {
            flattenHelper(clause.getSubClauses(), result);
        }
    }
}
/**
 * Counts the diffs whose risk level is exactly {@code level}.
 *
 * @param diffs diffs to inspect
 * @param level risk level to count
 * @return number of matching diffs
 */
private long countByRisk(List<ClauseDiff> diffs, RiskLevel level) {
    long matches = 0;
    for (ClauseDiff diff : diffs) {
        if (diff.getRiskLevel() == level) {
            matches++;
        }
    }
    return matches;
}
}
报告生成:HTML交互式差异报告
/**
* 合同差异报告生成器
* 生成包含高亮差异的HTML报告
*/
@Service
public class DiffReportGenerator {
/**
 * Renders a diff report as a standalone HTML document.
 *
 * <p>The page contains a summary box (file names, analysis time, risk counts, summary
 * text) followed by one section per clause diff. Sections for {@code MODIFIED} diffs
 * include a side-by-side before/after view. All report text is passed through
 * {@code escapeHtml} before being inserted.
 *
 * @param report the diff report to render
 * @return the HTML document encoded as UTF-8
 */
public byte[] generateHtmlReport(ContractDiffReport report) {
    StringBuilder html = new StringBuilder();
    html.append("""
            <!DOCTYPE html>
            <html lang="zh-CN">
            <head>
            <meta charset="UTF-8">
            <title>合同差异报告</title>
            <style>
            body { font-family: "Microsoft YaHei", sans-serif; margin: 20px; }
            .summary { background: #f5f5f5; padding: 15px; border-radius: 8px; margin-bottom: 20px; }
            .risk-high { background-color: #ffebee; border-left: 4px solid #f44336; }
            .risk-medium { background-color: #fff8e1; border-left: 4px solid #ff9800; }
            .risk-low { background-color: #e8f5e9; border-left: 4px solid #4caf50; }
            .diff-container { display: flex; gap: 10px; margin: 10px 0; }
            .diff-left, .diff-right { flex: 1; padding: 10px; border: 1px solid #ddd; }
            .added { background-color: #c8e6c9; }
            .deleted { background-color: #ffcdd2; text-decoration: line-through; }
            .clause-id { color: #666; font-size: 0.85em; }
            .badge { padding: 2px 8px; border-radius: 4px; font-size: 0.8em; font-weight: bold; }
            .badge-high { background: #f44336; color: white; }
            .badge-medium { background: #ff9800; color: white; }
            .badge-low { background: #4caf50; color: white; }
            </style>
            </head>
            <body>
            """);

    // Report header and summary box.
    html.append("<h1>合同差异分析报告</h1>");
    html.append(String.format("""
            <div class="summary">
            <h2>摘要</h2>
            <p><strong>原版本:</strong>%s</p>
            <p><strong>新版本:</strong>%s</p>
            <p><strong>分析时间:</strong>%s</p>
            <p><strong>差异统计:</strong>
            <span class="badge badge-high">高风险 %d 处</span>
            <span class="badge badge-medium">中风险 %d 处</span>
            <span class="badge badge-low">低风险 %d 处</span>
            </p>
            <p>%s</p>
            </div>
            """,
            escapeHtml(report.getVersionA()),
            escapeHtml(report.getVersionB()),
            report.getAnalysisTime().toString(),
            report.getHighRiskCount(),
            report.getMediumRiskCount(),
            report.getLowRiskCount(),
            escapeHtml(report.getSummary()).replace("\n", "<br>")));

    // One section per clause diff.
    html.append("<h2>逐条差异分析</h2>");
    for (ClauseDiff diff : report.getClauseDiffs()) {
        // A diff without a risk level is rendered as UNKNOWN instead of aborting the
        // whole report with a NullPointerException.
        String riskName = diff.getRiskLevel() == null ? "UNKNOWN" : diff.getRiskLevel().name();
        // Locale.ROOT keeps the CSS class names stable: under e.g. a Turkish default
        // locale "HIGH".toLowerCase() would not produce "high".
        String riskSuffix = riskName.toLowerCase(java.util.Locale.ROOT);
        String riskClass = "risk-" + riskSuffix;
        String badgeClass = "badge-" + riskSuffix;

        html.append(String.format("""
                <div class="%s" style="margin: 15px 0; padding: 15px; border-radius: 4px;">
                <div style="display:flex; justify-content:space-between; align-items:center;">
                <h3>%s <span class="clause-id">(%s)</span></h3>
                <span class="badge %s">%s</span>
                </div>
                <p><strong>变化类型:</strong>%s</p>
                <p><strong>变化摘要:</strong>%s</p>
                <p><strong>法律意义:</strong>%s</p>
                """,
                riskClass,
                escapeHtml(diff.getClauseTitle()),
                escapeHtml(diff.getClauseId()),
                badgeClass,
                riskName,
                escapeHtml(diff.getChangeType()),
                escapeHtml(diff.getChangeSummary()),
                escapeHtml(diff.getLegalImplication())));

        // Side-by-side before/after view, only for modified clauses.
        if (diff.getDiffType() == DiffType.MODIFIED) {
            html.append("""
                    <div class="diff-container">
                    <div class="diff-left">
                    <h4>修改前</h4>
                    <p>""");
            html.append(escapeHtml(diff.getContentBefore()));
            html.append("""
                    </p>
                    </div>
                    <div class="diff-right">
                    <h4>修改后</h4>
                    <p>""");
            html.append(escapeHtml(diff.getContentAfter()));
            html.append("</p></div></div>");
        }

        if (diff.getRecommendation() != null) {
            html.append(String.format("<p><strong>建议:</strong>%s</p>",
                    escapeHtml(diff.getRecommendation())));
        }
        html.append("</div>");
    }

    html.append("</body></html>");
    return html.toString().getBytes(StandardCharsets.UTF_8);
}
private String escapeHtml(String text) {
if (text == null) return "";
return text.replace("&", "&")
.replace("<", "<")
.replace(">", ">")
.replace("\"", """);
}
}
关键风险检测:数字变化专项扫描
合同中的数字变化最容易被忽视却最危险:
/**
* 合同数字变化专项检测器
* 专门扫描金额、期限、比例等关键数字的变化
*/
@Service
public class ContractNumberChangeDetector {
// Patterns for numbers treated as critical in a contract. In every pattern,
// capture group 1 is the numeric value.
// NOTE(review): the character class "[::]" lists the same colon twice; presumably one
// of the two was a full-width colon in the original source — confirm.
private static final List<Pattern> CRITICAL_NUMBER_PATTERNS = Arrays.asList(
// Amount keyword, optional colons/whitespace, a number (digits, commas, optional decimal part), then a currency unit.
Pattern.compile("(?:金额|价款|费用|报酬)[::]*\\s*(\\d[\\d,]*\\.?\\d*)\\s*(?:元|万|百万|亿)"),
// Duration keyword, optional colons/whitespace, an integer, then a time unit.
Pattern.compile("(?:期限|天数|工期)[::]*\\s*(\\d+)\\s*(?:日|天|月|年)"),
// Penalty keyword, optional colons/whitespace, a number, then an optional percent sign.
Pattern.compile("(?:违约金|罚款|赔偿)[::]*\\s*(\\d[\\d,]*\\.?\\d*)%?"),
// An integer percentage immediately followed (after optional whitespace) by a rate keyword.
Pattern.compile("(\\d+)%\\s*(?:利率|比例|分成|提成)")
);
/**
 * Scans two contract texts for changed critical numbers.
 *
 * <p>For every pattern in {@code CRITICAL_NUMBER_PATTERNS} the numbers of both texts are
 * extracted into context-keyed maps. A change is reported for each context key that is
 * present in both maps with different values; keys present in only one text are ignored.
 *
 * @param textA text of the original version
 * @param textB text of the revised version
 * @return the detected number changes, grouped by pattern in list order
 */
public List<NumberChange> detectNumberChanges(String textA, String textB) {
    List<NumberChange> detected = new ArrayList<>();
    for (Pattern candidate : CRITICAL_NUMBER_PATTERNS) {
        Map<String, String> before = extractNumbers(textA, candidate);
        Map<String, String> after = extractNumbers(textB, candidate);

        for (String context : before.keySet()) {
            String oldValue = before.get(context);
            String newValue = after.get(context);
            // Skip contexts missing from the revised text and unchanged values.
            if (newValue == null || oldValue.equals(newValue)) {
                continue;
            }
            boolean increased = compareNumbers(newValue, oldValue) > 0;
            double percent = calculateChangePercent(oldValue, newValue);
            detected.add(NumberChange.builder()
                    .context(context)
                    .valueBefore(oldValue)
                    .valueAfter(newValue)
                    .isIncrease(increased)
                    .changePercent(percent)
                    .build());
        }
    }
    return detected;
}
/**
 * Extracts the numbers matched by {@code pattern}, keyed by their surrounding context.
 *
 * <p>The key is the text from up to 10 characters before the match to up to 10
 * characters after it, with the captured number itself replaced by the placeholder
 * {@code {N}}. Masking the number is essential: the key is used to pair the same
 * location across two contract versions, and a key that contained the number would
 * differ as soon as the number changed, so no change could ever be paired.
 *
 * <p>Limitation: if other text inside the 10-character window also differs between
 * versions, the keys will not match. When the same key occurs more than once, the last
 * occurrence wins.
 *
 * @param text    text to scan
 * @param pattern pattern whose capture group 1 is the number
 * @return context-key to number map in order of first key occurrence
 */
private Map<String, String> extractNumbers(String text, Pattern pattern) {
    Map<String, String> results = new LinkedHashMap<>();
    Matcher matcher = pattern.matcher(text);
    while (matcher.find()) {
        if (matcher.start(1) < 0) {
            // Group 1 did not participate in this match; there is no number to record.
            continue;
        }
        int start = Math.max(0, matcher.start() - 10);
        int end = Math.min(text.length(), matcher.end() + 10);
        String context = text.substring(start, matcher.start(1))
                + "{N}"
                + text.substring(matcher.end(1), end);
        results.put(context, matcher.group(1));
    }
    return results;
}
/**
 * Compares two numeric strings after removing thousands separators (commas).
 *
 * @param a first number as text
 * @param b second number as text
 * @return the result of {@code Double.compare} on the parsed values, or 0 if either
 *         string cannot be parsed as a number
 */
private int compareNumbers(String a, String b) {
    double left;
    double right;
    try {
        left = Double.parseDouble(a.replace(",", ""));
        right = Double.parseDouble(b.replace(",", ""));
    } catch (NumberFormatException e) {
        return 0;
    }
    return Double.compare(left, right);
}
/**
 * Computes the relative change from {@code before} to {@code after} in percent.
 * Commas are removed from both strings before parsing.
 *
 * @param before original value as text
 * @param after  new value as text
 * @return {@code (after - before) / before * 100}; 0 when {@code before} parses to zero
 *         or when either string cannot be parsed as a number
 */
private double calculateChangePercent(String before, String after) {
    double base;
    double updated;
    try {
        base = Double.parseDouble(before.replace(",", ""));
        updated = Double.parseDouble(after.replace(",", ""));
    } catch (NumberFormatException e) {
        return 0;
    }
    if (base == 0) {
        return 0;
    }
    return (updated - base) / base * 100;
}
}
落地效果与局限性
这个系统上线后,律所反馈:
- 省时效果显著:复杂合同差异分析从半天缩短到20分钟(初步分析),律师主要时间用在验证AI结论和深度分析高风险条款
- 漏检率降低:之前人工审核偶尔会漏掉微小的数字改动(如从"5%"变"5.5%"),系统基本能捕捉
- 准确率问题:风险级别判断约85%符合律师预期,15%存在分歧(主要是中/高风险的边界判断,以及不同合同类型的语境理解)
重要局限:AI分析结论不能直接用于法律决策,必须有律师复核。系统定位是"提高律师工作效率的工具",不是"替代律师的决策系统"。
