第2063篇:文档智能——OCR+LLM实现合同/发票信息提取
2026/4/30大约 6 分钟
第2063篇:文档智能——OCR+LLM实现合同/发票信息提取
适读人群:需要处理非结构化文档的工程师 | 阅读时长:约19分钟 | 核心价值:掌握OCR+LLM的文档处理流水线,实现合同、发票、报表等文档的自动化信息提取
财务部门有个同事手工核对发票,每天要对几百张。每张发票要看:发票号码、开票日期、金额、税率、供应商信息。
我问他:这活儿为什么不自动化?他说:公司IT觉得发票识别准确率达不到要求,之前试过某OCR系统,识别后还是要人工复核。
这其实是用单独OCR解决不了的问题,但OCR+LLM组合可以解决——OCR负责"看",LLM负责"理解和提取"。
文档处理流水线
第一步:文档预处理
/**
* 文档预处理工具
* 处理不同来源的文档,统一转为文本
*/
@Service
@RequiredArgsConstructor
@Slf4j
public class DocumentPreprocessor {
private final OcrService ocrService;
/**
 * Single entry point for reading a document; dispatches on the file extension.
 * PDF, raster images (jpg/jpeg/png/bmp/tiff) and DOCX are routed to their own handlers.
 *
 * @param fileContent raw bytes of the file
 * @param fileName    file name; only the part after the last dot is used to pick a handler
 * @return the document produced by the matching handler
 * @throws UnsupportedDocumentTypeException if the extension matches none of the supported types
 */
public ProcessedDocument process(byte[] fileContent, String fileName) {
    String extension = getExtension(fileName);
    // Locale.ROOT keeps the match locale-independent: with a Turkish default locale,
    // "TIFF".toLowerCase() becomes "tıff" (dotless i) and would be rejected as unsupported.
    String normalized = extension.toLowerCase(java.util.Locale.ROOT);
    return switch (normalized) {
        case "pdf" -> processPdf(fileContent, fileName);
        case "jpg", "jpeg", "png", "bmp", "tiff" -> processImage(fileContent, fileName);
        case "docx" -> processDocx(fileContent, fileName);
        // The message reports the extension exactly as it appeared in the file name.
        default -> throw new UnsupportedDocumentTypeException("不支持的文件类型: " + extension);
    };
}
/**
 * Handles a PDF by trying direct text extraction first and falling back to OCR.
 * A PDF whose stripped text is longer than 100 characters (after trimming) is treated
 * as a text PDF; anything shorter is treated as a scan.
 *
 * @param content  raw PDF bytes
 * @param fileName file name, used for logging and carried into the result
 * @return the extracted document
 * @throws DocumentProcessException if loading or reading the PDF raises an IOException
 */
private ProcessedDocument processPdf(byte[] content, String fileName) {
    try (PDDocument pdf = PDDocument.load(content)) {
        String text = new PDFTextStripper().getText(pdf);
        boolean looksScanned = text.trim().length() <= 100;
        if (looksScanned) {
            // Too little embedded text: render the pages and run OCR instead.
            log.info("扫描PDF: {}, 转换图片后OCR", fileName);
            return processScannedPdf(pdf, fileName);
        }
        log.info("文字PDF: {}, 提取文字{}字符", fileName, text.length());
        return ProcessedDocument.fromText(text, fileName);
    } catch (IOException ioe) {
        throw new DocumentProcessException("PDF处理失败: " + fileName, ioe);
    }
}
/**
 * Runs OCR over every page of a scanned PDF and joins the page texts.
 * Each page is rendered at 300 DPI, converted to grayscale and passed to the OCR service.
 * A page that fails is logged and recorded as an empty string, so the index in the
 * page list still corresponds to the page number.
 *
 * @param doc      an open PDF document; the caller remains responsible for closing it
 * @param fileName file name, carried into the result
 * @return a document whose source is {@code OCR}, with per-page texts and the joined full text
 */
private ProcessedDocument processScannedPdf(PDDocument doc, String fileName) {
    List<String> pageTexts = new ArrayList<>();
    PDFRenderer renderer = new PDFRenderer(doc);
    for (int page = 0; page < doc.getNumberOfPages(); page++) {
        try {
            // Render at 300 DPI before recognition.
            BufferedImage image = renderer.renderImageWithDPI(page, 300);
            BufferedImage enhanced = enhanceImage(image);
            String pageText = ocrService.recognize(imageToBytes(enhanced));
            pageTexts.add(pageText);
            log.debug("第{}页OCR完成: {}字符", page + 1, pageText.length());
        } catch (Exception e) {
            // Best effort: one bad page must not abort the whole document.
            // The exception is passed as the last argument so the stack trace is logged.
            log.warn("第{}页处理失败: {}", page + 1, e.getMessage(), e);
            pageTexts.add("");
        }
    }
    String fullText = String.join("\n\n--- 页码分割 ---\n\n", pageTexts);
    // Built directly instead of via fromText(), which always labels the result
    // TEXT_EXTRACTION and leaves the page list unset.
    return ProcessedDocument.builder()
            .fileName(fileName)
            .extractedText(fullText)
            .pages(pageTexts)
            .source(DocumentSource.OCR)
            .build();
}
/**
 * Converts an image to 8-bit grayscale by drawing it onto a TYPE_BYTE_GRAY canvas
 * of the same size.
 *
 * @param original source image
 * @return a new grayscale image with the same width and height
 */
private BufferedImage enhanceImage(BufferedImage original) {
    BufferedImage gray = new BufferedImage(
            original.getWidth(), original.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
    Graphics g = gray.getGraphics();
    try {
        g.drawImage(original, 0, 0, null);
    } finally {
        // Release the graphics context even if drawing throws.
        g.dispose();
    }
    return gray;
}
/**
 * Returns the text after the last dot of a file name, with its case unchanged.
 *
 * @param fileName file name; may be {@code null}
 * @return the extension without the dot, or an empty string when the name is
 *         {@code null} or contains no dot
 */
private String getExtension(String fileName) {
    // A missing name has no extension; return "" rather than throwing a NullPointerException.
    if (fileName == null) {
        return "";
    }
    int dotIndex = fileName.lastIndexOf('.');
    return dotIndex >= 0 ? fileName.substring(dotIndex + 1) : "";
}
/**
 * Result of preprocessing: the text obtained from a document plus where it came from.
 */
@Data
@Builder
public static class ProcessedDocument {
    /** Name of the file the text was read from. */
    private String fileName;
    /** Full text of the document. */
    private String extractedText;
    /** Per-page texts; not populated by {@link #fromText(String, String)}. */
    private List<String> pages;
    /** How the text was obtained. */
    private DocumentSource source;

    /**
     * Creates a document marked as {@code TEXT_EXTRACTION} from already available text.
     *
     * @param text     the document text
     * @param fileName name of the originating file
     * @return a document with file name, text and source set
     */
    public static ProcessedDocument fromText(String text, String fileName) {
        var draft = ProcessedDocument.builder();
        draft.fileName(fileName);
        draft.extractedText(text);
        draft.source(DocumentSource.TEXT_EXTRACTION);
        return draft.build();
    }
}
/** Identifies how a document's text was obtained. */
public enum DocumentSource {
    /** Text taken directly from the file's own text content. */
    TEXT_EXTRACTION,
    /** Text produced by optical character recognition. */
    OCR
}
}
第二步:LLM结构化提取
/**
* 基于LLM的结构化信息提取
* 支持发票、合同、报表等多种文档类型
*/
@Service
@RequiredArgsConstructor
@Slf4j
public class StructuredExtractorService {
private final ChatLanguageModel llm;
private final ObjectMapper objectMapper;
/**
 * Extracts invoice fields from document text by prompting the LLM for a JSON object.
 * The text is shortened with {@code truncateForPrompt} before being embedded in the prompt,
 * and the model's reply is handed to {@code parseInvoiceInfo}.
 *
 * @param documentText text of the invoice
 * @return the parsed invoice information
 */
public InvoiceInfo extractInvoice(String documentText) {
    String promptTemplate = """
        请从以下发票文本中提取关键信息,输出严格的JSON格式。
        如果某个字段无法确定,填null。
        日期格式统一为yyyy-MM-dd。
        金额格式为数字(不含货币符号和千分位逗号)。
        发票文本:
        %s
        输出JSON(只输出JSON,不要其他内容):
        {
        "invoiceCode": "发票代码",
        "invoiceNumber": "发票号码",
        "invoiceDate": "开票日期",
        "sellerName": "销售方名称",
        "sellerTaxId": "销售方纳税人识别号",
        "buyerName": "购买方名称",
        "buyerTaxId": "购买方纳税人识别号",
        "totalAmount": 不含税金额数字,
        "taxAmount": 税额数字,
        "totalAmountWithTax": 含税总额数字,
        "taxRate": 税率百分比数字如0.13,
        "items": [
        {
        "name": "货物/服务名称",
        "quantity": 数量数字,
        "unit": "单位",
        "unitPrice": 单价数字,
        "amount": 金额数字
        }
        ]
        }
        """;
    String extractionPrompt = promptTemplate.formatted(truncateForPrompt(documentText));
    String modelReply = llm.generate(extractionPrompt);
    return parseInvoiceInfo(modelReply);
}
/**
 * Extracts key contract fields from document text by prompting the LLM for a JSON object.
 * The text is shortened with {@code truncateForPrompt} before being embedded in the prompt,
 * and the model's reply is handed to {@code parseContractInfo}.
 *
 * @param documentText text of the contract
 * @return the parsed contract information
 */
public ContractInfo extractContract(String documentText) {
    String promptTemplate = """
        请从以下合同文本中提取关键信息,输出JSON格式。
        合同文本:
        %s
        输出JSON:
        {
        "contractNumber": "合同编号",
        "contractName": "合同名称",
        "signDate": "签订日期",
        "effectiveDate": "生效日期",
        "expiryDate": "终止日期",
        "partyA": {
        "name": "甲方名称",
        "representative": "法定代表人",
        "address": "地址"
        },
        "partyB": {
        "name": "乙方名称",
        "representative": "法定代表人",
        "address": "地址"
        },
        "contractAmount": 合同金额数字或null,
        "currency": "货币",
        "paymentTerms": "付款方式描述",
        "deliveryTerms": "交付条款",
        "warrantyPeriod": "质保期",
        "penaltyClause": "违约责任关键条款",
        "arbitrationClause": "争议解决方式"
        }
        """;
    String extractionPrompt = promptTemplate.formatted(truncateForPrompt(documentText));
    String modelReply = llm.generate(extractionPrompt);
    return parseContractInfo(modelReply);
}
/**
 * Generic extraction driven by a caller-supplied output template.
 *
 * @param documentText       text of the document; shortened before being put into the prompt
 * @param extractionTemplate description of the expected JSON output (e.g. a JSON Schema)
 * @param targetClass        class the model's JSON reply is deserialized into
 * @param <T>                type of the extraction result
 * @return the deserialized result
 * @throws ExtractionException if cleaning or deserializing the model's reply fails
 */
public <T> T extractWithTemplate(
        String documentText,
        String extractionTemplate,
        Class<T> targetClass) {
    String promptTemplate = """
        请从以下文档中提取信息,按照指定的JSON格式输出。
        只输出JSON,不要其他文字。
        文档内容:
        %s
        提取格式:
        %s
        """;
    String prompt = promptTemplate.formatted(truncateForPrompt(documentText), extractionTemplate);
    // The model call sits outside the try block, so its failures propagate unwrapped.
    String response = llm.generate(prompt);
    try {
        return objectMapper.readValue(extractJsonFromResponse(response), targetClass);
    } catch (Exception parseFailure) {
        log.error("信息提取解析失败: {}", parseFailure.getMessage());
        throw new ExtractionException("信息提取失败", parseFailure);
    }
}
/**
 * Shortens a document so it fits into a prompt.
 * Text of at most 4000 characters is returned unchanged. Longer text is reduced to its
 * first and last 2000 characters with an omission marker in between, on the assumption
 * that the key information is usually near the beginning and the end.
 *
 * @param text document text; may be {@code null}
 * @return the original or shortened text; an empty string for {@code null}
 */
private String truncateForPrompt(String text) {
    // Missing text becomes an empty document instead of a NullPointerException.
    if (text == null) {
        return "";
    }
    int maxLength = 4000;
    if (text.length() <= maxLength) {
        return text;
    }
    int halfLength = maxLength / 2;

    // Move each cut point by one char if it would fall between the two halves of a
    // surrogate pair; otherwise the prompt would contain a broken character.
    int headEnd = halfLength;
    if (Character.isHighSurrogate(text.charAt(headEnd - 1))
            && Character.isLowSurrogate(text.charAt(headEnd))) {
        headEnd--;
    }
    int tailStart = text.length() - halfLength;
    if (Character.isLowSurrogate(text.charAt(tailStart))
            && Character.isHighSurrogate(text.charAt(tailStart - 1))) {
        tailStart++;
    }

    return text.substring(0, headEnd)
            + "\n...[文档中间部分省略]...\n"
            + text.substring(tailStart);
}
/**
 * Isolates the JSON payload in a raw LLM reply.
 * Handles a markdown code fence anywhere in the reply (with an optional, case-insensitive
 * {@code json} tag) and strips explanatory text before or after a JSON object or array.
 * A reply that is already a bare JSON value is returned trimmed and otherwise unchanged.
 *
 * @param response raw model output; may be {@code null}
 * @return the trimmed JSON text, or an empty string for {@code null}
 */
private String extractJsonFromResponse(String response) {
    if (response == null) {
        return "";
    }
    String cleaned = response.trim();

    // Step 1: take the content of the first markdown code fence, if there is one.
    int fenceOpen = cleaned.indexOf("```");
    if (fenceOpen >= 0) {
        int bodyStart = fenceOpen + 3;
        if (cleaned.regionMatches(true, bodyStart, "json", 0, 4)) {
            bodyStart += 4;
        }
        int fenceClose = cleaned.indexOf("```", bodyStart);
        String inside = fenceClose >= 0
                ? cleaned.substring(bodyStart, fenceClose)
                : cleaned.substring(bodyStart);
        // A lone fence at the very end encloses nothing; keep the text before it instead.
        cleaned = inside.isBlank() ? cleaned.substring(0, fenceOpen).trim() : inside.trim();
    }

    // Step 2: drop anything around the outermost object/array. Skipped for a JSON string
    // scalar, whose content may legitimately contain braces or brackets.
    if (!cleaned.isEmpty() && cleaned.charAt(0) != '"') {
        int objStart = cleaned.indexOf('{');
        int arrStart = cleaned.indexOf('[');
        int start;
        if (objStart < 0) {
            start = arrStart;
        } else if (arrStart < 0) {
            start = objStart;
        } else {
            start = Math.min(objStart, arrStart);
        }
        if (start >= 0) {
            char closer = cleaned.charAt(start) == '{' ? '}' : ']';
            int end = cleaned.lastIndexOf(closer);
            if (end > start) {
                cleaned = cleaned.substring(start, end + 1);
            }
        }
    }
    return cleaned;
}
/**
 * Deserializes the model's reply into an {@code InvoiceInfo}.
 * Any failure is logged and answered with an empty object, leaving it to the
 * business layer to deal with the missing fields.
 *
 * @param json raw model reply
 * @return the parsed invoice, or a new empty instance when parsing fails
 */
private InvoiceInfo parseInvoiceInfo(String json) {
    InvoiceInfo parsed;
    try {
        String payload = extractJsonFromResponse(json);
        parsed = objectMapper.readValue(payload, InvoiceInfo.class);
    } catch (Exception ex) {
        log.error("发票信息解析失败: {}", ex.getMessage());
        parsed = new InvoiceInfo();
    }
    return parsed;
}
/**
 * Deserializes the model's reply into a {@code ContractInfo}.
 * Any failure is logged and answered with an empty object.
 *
 * @param json raw model reply
 * @return the parsed contract, or a new empty instance when parsing fails
 */
private ContractInfo parseContractInfo(String json) {
    ContractInfo parsed;
    try {
        String payload = extractJsonFromResponse(json);
        parsed = objectMapper.readValue(payload, ContractInfo.class);
    } catch (Exception ex) {
        log.error("合同信息解析失败: {}", ex.getMessage());
        parsed = new ContractInfo();
    }
    return parsed;
}
}
第三步:结果验证
/**
* 提取结果的业务规则验证
*/
@Component
public class InvoiceValidator {
/**
 * Checks an extracted invoice against completeness and plausibility rules.
 * Errors make the result invalid; warnings are advisory and do not affect validity.
 *
 * @param invoice extracted invoice data; {@code null} is reported as an error
 * @return the outcome; {@code valid} is true only when no error was recorded
 */
public ValidationResult validate(InvoiceInfo invoice) {
    List<String> errors = new ArrayList<>();
    List<String> warnings = new ArrayList<>();

    // Nothing to inspect: report it instead of failing with a NullPointerException.
    if (invoice == null) {
        errors.add("发票信息不能为空");
        return new ValidationResult(false, errors, warnings);
    }

    // Required fields.
    if (isEmpty(invoice.getInvoiceNumber())) {
        errors.add("发票号码不能为空");
    }
    if (isEmpty(invoice.getSellerName())) {
        errors.add("销售方名称不能为空");
    }

    // Amount consistency: net amount + tax should equal the gross total.
    if (invoice.getTotalAmount() != null && invoice.getTaxAmount() != null
            && invoice.getTotalAmountWithTax() != null) {
        double expectedTotal = invoice.getTotalAmount() + invoice.getTaxAmount();
        double diff = Math.abs(expectedTotal - invoice.getTotalAmountWithTax());
        if (diff > 0.02) { // tolerate 0.02 of rounding error
            warnings.add(String.format(
                    "金额不一致:不含税金额+税额=%.2f,价税合计=%.2f,差额%.2f",
                    expectedTotal, invoice.getTotalAmountWithTax(), diff));
        }
    }

    // Seller tax ID format: 15 to 20 digits or upper-case letters.
    if (!isEmpty(invoice.getSellerTaxId())
            && !invoice.getSellerTaxId().matches("[0-9A-Z]{15,20}")) {
        warnings.add("销售方纳税人识别号格式可能有误: " + invoice.getSellerTaxId());
    }

    // Date plausibility. The value comes from an LLM, so it may not be ISO yyyy-MM-dd;
    // an unparseable date is recorded as an error rather than aborting validation.
    if (invoice.getInvoiceDate() != null) {
        try {
            LocalDate invoiceDate = LocalDate.parse(invoice.getInvoiceDate());
            // Read the clock once so both comparisons use the same "today".
            LocalDate today = LocalDate.now();
            if (invoiceDate.isAfter(today)) {
                errors.add("开票日期不能是未来日期");
            }
            if (invoiceDate.isBefore(today.minusYears(5))) {
                warnings.add("开票日期超过5年,请确认");
            }
        } catch (java.time.format.DateTimeParseException e) {
            errors.add("开票日期格式无效: " + invoice.getInvoiceDate());
        }
    }

    return new ValidationResult(errors.isEmpty(), errors, warnings);
}
/**
 * Tells whether a string carries no visible content.
 *
 * @param s value to test; may be {@code null}
 * @return true for {@code null}, an empty string, or a string made only of whitespace
 */
private boolean isEmpty(String s) {
    // isBlank() also recognises Unicode whitespace such as the ideographic space U+3000,
    // which trim() leaves in place.
    return s == null || s.isBlank();
}
/**
 * Outcome of a validation run.
 *
 * @param valid    whether the validated object passed
 * @param errors   messages for the problems found
 * @param warnings messages for findings that need attention
 */
public record ValidationResult(boolean valid, List<String> errors, List<String> warnings) {
}
}
完整处理流水线
/**
* 文档处理完整流水线
*/
@Service
@RequiredArgsConstructor
@Slf4j
public class DocumentIntelligencePipeline {
private final DocumentPreprocessor preprocessor;
private final StructuredExtractorService extractor;
private final InvoiceValidator validator;
/**
 * Runs the full invoice pipeline: preprocess the file, extract fields with the LLM,
 * validate the extracted data and bundle everything into one result.
 * Any exception raised along the way is logged and converted into an error result,
 * so this method does not propagate exceptions from those steps.
 *
 * @param fileContent raw bytes of the invoice file
 * @param fileName    file name, used to choose the preprocessing path and in log messages
 * @return the processing result; on failure only file name, error message and timestamp are set
 */
public InvoiceProcessResult processInvoice(byte[] fileContent, String fileName) {
    try {
        // 1. Preprocess: turn the file into text.
        log.info("开始处理发票: {}", fileName);
        DocumentPreprocessor.ProcessedDocument doc =
                preprocessor.process(fileContent, fileName);

        // 2. Extract structured fields.
        InvoiceInfo invoice = extractor.extractInvoice(doc.getExtractedText());

        // 3. Validate against business rules.
        InvoiceValidator.ValidationResult validation = validator.validate(invoice);

        // 4. Assemble the result.
        InvoiceProcessResult result = InvoiceProcessResult.builder()
                .fileName(fileName)
                .invoice(invoice)
                .validation(validation)
                .ocrSource(doc.getSource())
                .processedAt(LocalDateTime.now())
                .build();

        if (validation.valid()) {
            log.info("发票{}处理成功: 金额={}元", fileName, invoice.getTotalAmountWithTax());
        } else {
            log.warn("发票{}验证失败: {}", fileName, validation.errors());
        }
        return result;
    } catch (Exception e) {
        log.error("发票{}处理异常: {}", fileName, e.getMessage(), e);
        // Some exceptions carry no message; fall back to toString() so an error result
        // always has a non-null errorMessage and cannot be mistaken for a success.
        String reason = e.getMessage() != null ? e.getMessage() : e.toString();
        return InvoiceProcessResult.error(fileName, reason);
    }
}
/**
 * Outcome of processing one invoice file: either the extracted invoice with its
 * validation, or an error message.
 */
@Data
@Builder
public static class InvoiceProcessResult {
    /** Name of the processed file. */
    private String fileName;
    /** Extracted invoice data; not set by {@link #error(String, String)}. */
    private InvoiceInfo invoice;
    /** Validation outcome for the extracted invoice; not set by {@link #error(String, String)}. */
    private InvoiceValidator.ValidationResult validation;
    /** Source reported by the preprocessed document. */
    private DocumentPreprocessor.DocumentSource ocrSource;
    /** Time at which the result was created. */
    private LocalDateTime processedAt;
    /** Failure description; set by {@link #error(String, String)}. */
    private String errorMessage;

    /**
     * Creates a result describing a failed run.
     *
     * @param fileName name of the file that failed
     * @param error    description of the failure
     * @return a result with file name, error message and the current time set
     */
    public static InvoiceProcessResult error(String fileName, String error) {
        var failure = InvoiceProcessResult.builder();
        failure.fileName(fileName);
        failure.errorMessage(error);
        failure.processedAt(LocalDateTime.now());
        return failure.build();
    }
}
}
OCR+LLM的组合让文档信息提取的准确率比单独OCR高很多,主要原因是LLM能理解上下文——即使OCR识别错了几个字,LLM也能根据上下文纠正。
