第2063篇：文档智能——OCR+LLM实现合同/发票信息提取

老张 · 2026/4/30 · 大约 6 分钟

第2063篇：文档智能——OCR+LLM实现合同/发票信息提取

适读人群：需要处理非结构化文档的工程师 | 阅读时长：约19分钟 | 核心价值：掌握OCR+LLM的文档处理流水线，实现合同、发票、报表等文档的自动化信息提取

财务部门有个同事手工核对发票，每天要对几百张。每张发票要看：发票号码、开票日期、金额、税率、供应商信息。

我问他：这活儿为什么不自动化？他说：公司IT觉得发票识别准确率达不到要求，之前试过某OCR系统，识别后还是要人工复核。

这其实是单靠OCR解决不了的问题：OCR只能把图像转成一串文字，既分不清哪个数字是金额、哪个是发票号码，也发现不了自己的识别错误。但OCR+LLM组合可以解决——OCR负责"看"，LLM负责"理解和提取"。

文档处理流水线

第一步：文档预处理

/**
 * Document preprocessing: converts uploaded files from different sources into plain text.
 *
 * <p>PDFs with a usable text layer are read directly; scanned PDFs are rendered page by page
 * and passed through {@link OcrService}. The returned {@link ProcessedDocument} records which
 * of the two paths produced the text.
 *
 * <p>NOTE(review): {@code processImage}, {@code processDocx} and {@code imageToBytes} are called
 * below but are not defined in this listing; they must be supplied for the class to compile.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class DocumentPreprocessor {

    /** A PDF whose text layer yields more than this many (trimmed) characters is treated as a text PDF. */
    private static final int MIN_TEXT_PDF_LENGTH = 100;

    /** Rendering resolution for scanned pages; 300 DPI is the resolution recommended for OCR. */
    private static final int OCR_RENDER_DPI = 300;

    /** Separator inserted between the OCR text of consecutive pages. */
    private static final String PAGE_SEPARATOR = "\n\n--- 页码分割 ---\n\n";

    private final OcrService ocrService;

    /**
     * Single entry point for reading a document. Dispatches on the file extension
     * (case-insensitively) to the PDF, image or DOCX handler.
     *
     * @param fileContent raw bytes of the uploaded file
     * @param fileName    original file name; only its extension is used for dispatching
     * @return the extracted text together with its provenance
     * @throws UnsupportedDocumentTypeException if the extension is missing or not supported
     * @throws DocumentProcessException         if a PDF cannot be read
     */
    public ProcessedDocument process(byte[] fileContent, String fileName) {
        String extension = getExtension(fileName);

        // Locale.ROOT keeps the comparison locale-independent: with a Turkish default locale
        // "TIFF".toLowerCase() would yield "tıff" and never match the "tiff" case.
        return switch (extension.toLowerCase(java.util.Locale.ROOT)) {
            case "pdf" -> processPdf(fileContent, fileName);
            case "jpg", "jpeg", "png", "bmp", "tiff" -> processImage(fileContent, fileName);
            case "docx" -> processDocx(fileContent, fileName);
            default -> throw new UnsupportedDocumentTypeException("不支持的文件类型: " + extension);
        };
    }

    /**
     * Handles a PDF: uses the embedded text layer when it contains enough text, otherwise
     * falls back to rendering the pages and running OCR on them.
     */
    private ProcessedDocument processPdf(byte[] content, String fileName) {
        try (PDDocument doc = PDDocument.load(content)) {
            PDFTextStripper stripper = new PDFTextStripper();
            String extractedText = stripper.getText(doc);

            // Enough text extracted: this is a text PDF, no OCR needed.
            if (extractedText.trim().length() > MIN_TEXT_PDF_LENGTH) {
                log.info("文字PDF: {}, 提取文字{}字符", fileName, extractedText.length());
                return ProcessedDocument.fromText(extractedText, fileName);
            }

            // Otherwise treat it as a scanned PDF and OCR the rendered pages.
            log.info("扫描PDF: {}, 转换图片后OCR", fileName);
            return processScannedPdf(doc, fileName);

        } catch (IOException e) {
            throw new DocumentProcessException("PDF处理失败: " + fileName, e);
        }
    }

    /**
     * Renders every page of a scanned PDF to an image, enhances it and runs OCR on it.
     *
     * <p>Processing is best-effort per page: a page that fails contributes an empty string so
     * that page positions stay aligned, and the remaining pages are still processed.
     */
    private ProcessedDocument processScannedPdf(PDDocument doc, String fileName) {
        List<String> pageTexts = new ArrayList<>();

        PDFRenderer renderer = new PDFRenderer(doc);
        for (int page = 0; page < doc.getNumberOfPages(); page++) {
            try {
                BufferedImage image = renderer.renderImageWithDPI(page, OCR_RENDER_DPI);

                // Image enhancement to improve OCR accuracy.
                BufferedImage enhanced = enhanceImage(image);

                String pageText = ocrService.recognize(imageToBytes(enhanced));
                if (pageText == null) {
                    // Normalise before adding, so a null result cannot end up in the list
                    // and then be followed by a second, duplicate entry from the catch block.
                    pageText = "";
                }
                pageTexts.add(pageText);

                log.debug("第{}页OCR完成: {}字符", page + 1, pageText.length());
            } catch (Exception e) {
                // Trailing throwable argument makes SLF4J log the stack trace as well.
                log.warn("第{}页处理失败: {}", page + 1, e.getMessage(), e);
                pageTexts.add("");
            }
        }

        // Tag the result as OCR output (previously it was mislabelled as TEXT_EXTRACTION).
        return ProcessedDocument.fromOcr(pageTexts, fileName);
    }

    /** Converts the image to 8-bit grayscale by drawing it onto a TYPE_BYTE_GRAY canvas. */
    private BufferedImage enhanceImage(BufferedImage original) {
        BufferedImage gray = new BufferedImage(
            original.getWidth(), original.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
        Graphics g = gray.getGraphics();
        g.drawImage(original, 0, 0, null);
        g.dispose();

        return gray;
    }

    /**
     * Returns the text after the last dot of {@code fileName}, or an empty string when the
     * name is {@code null} or contains no dot.
     */
    private String getExtension(String fileName) {
        if (fileName == null) {
            return "";
        }
        int dotIndex = fileName.lastIndexOf('.');
        return dotIndex >= 0 ? fileName.substring(dotIndex + 1) : "";
    }

    /** Result of preprocessing: the extracted text plus where it came from. */
    @Data
    @Builder
    public static class ProcessedDocument {
        private String fileName;
        private String extractedText;
        /** Per-page texts; populated by {@link #fromOcr}, left {@code null} by {@link #fromText}. */
        private List<String> pages;
        private DocumentSource source;

        /** Creates a result for text read directly from the document's text layer. */
        public static ProcessedDocument fromText(String text, String fileName) {
            return ProcessedDocument.builder()
                .fileName(fileName)
                .extractedText(text)
                .source(DocumentSource.TEXT_EXTRACTION)
                .build();
        }

        /**
         * Creates a result for OCR output.
         *
         * @param pageTexts recognised text of each page, in page order; must not contain nulls
         * @param fileName  original file name
         */
        public static ProcessedDocument fromOcr(List<String> pageTexts, String fileName) {
            return ProcessedDocument.builder()
                .fileName(fileName)
                .extractedText(String.join(PAGE_SEPARATOR, pageTexts))
                .pages(List.copyOf(pageTexts))
                .source(DocumentSource.OCR)
                .build();
        }
    }

    /** How the text of a {@link ProcessedDocument} was obtained. */
    public enum DocumentSource { TEXT_EXTRACTION, OCR }
}

第二步：LLM结构化提取

/**
 * 基于LLM的结构化信息提取
 * 支持发票、合同、报表等多种文档类型
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class StructuredExtractorService {
    
    private final ChatLanguageModel llm;
    private final ObjectMapper objectMapper;
    
    /**
     * 发票信息提取
     */
    public InvoiceInfo extractInvoice(String documentText) {
        String extractionPrompt = String.format("""
            请从以下发票文本中提取关键信息，输出严格的JSON格式。
            
            如果某个字段无法确定，填null。
            日期格式统一为yyyy-MM-dd。
            金额格式为数字（不含货币符号和千分位逗号）。
            
            发票文本：
            %s
            
            输出JSON（只输出JSON，不要其他内容）：
            {
              "invoiceCode": "发票代码",
              "invoiceNumber": "发票号码",
              "invoiceDate": "开票日期",
              "sellerName": "销售方名称",
              "sellerTaxId": "销售方纳税人识别号",
              "buyerName": "购买方名称",
              "buyerTaxId": "购买方纳税人识别号",
              "totalAmount": 不含税金额数字,
              "taxAmount": 税额数字,
              "totalAmountWithTax": 含税总额数字,
              "taxRate": 税率百分比数字如0.13,
              "items": [
                {
                  "name": "货物/服务名称",
                  "quantity": 数量数字,
                  "unit": "单位",
                  "unitPrice": 单价数字,
                  "amount": 金额数字
                }
              ]
            }
            """, truncateForPrompt(documentText));
        
        String response = llm.generate(extractionPrompt);
        return parseInvoiceInfo(response);
    }
    
    /**
     * 合同关键信息提取
     */
    public ContractInfo extractContract(String documentText) {
        String extractionPrompt = String.format("""
            请从以下合同文本中提取关键信息，输出JSON格式。
            
            合同文本：
            %s
            
            输出JSON：
            {
              "contractNumber": "合同编号",
              "contractName": "合同名称",
              "signDate": "签订日期",
              "effectiveDate": "生效日期",
              "expiryDate": "终止日期",
              "partyA": {
                "name": "甲方名称",
                "representative": "法定代表人",
                "address": "地址"
              },
              "partyB": {
                "name": "乙方名称",
                "representative": "法定代表人",
                "address": "地址"
              },
              "contractAmount": 合同金额数字或null,
              "currency": "货币",
              "paymentTerms": "付款方式描述",
              "deliveryTerms": "交付条款",
              "warrantyPeriod": "质保期",
              "penaltyClause": "违约责任关键条款",
              "arbitrationClause": "争议解决方式"
            }
            """, truncateForPrompt(documentText));
        
        String response = llm.generate(extractionPrompt);
        return parseContractInfo(response);
    }
    
    /**
     * 通用信息提取（自定义模板）
     */
    public <T> T extractWithTemplate(
            String documentText, 
            String extractionTemplate,  // JSON Schema描述
            Class<T> targetClass) {
        
        String prompt = String.format("""
            请从以下文档中提取信息，按照指定的JSON格式输出。
            只输出JSON，不要其他文字。
            
            文档内容：
            %s
            
            提取格式：
            %s
            """, truncateForPrompt(documentText), extractionTemplate);
        
        String response = llm.generate(prompt);
        
        try {
            String json = extractJsonFromResponse(response);
            return objectMapper.readValue(json, targetClass);
        } catch (Exception e) {
            log.error("信息提取解析失败: {}", e.getMessage());
            throw new ExtractionException("信息提取失败", e);
        }
    }
    
    private String truncateForPrompt(String text) {
        // LLM的上下文有限，超长文档需要截断
        int maxLength = 4000;
        if (text.length() <= maxLength) return text;
        
        // 截取前后部分，通常关键信息在头尾
        int halfLength = maxLength / 2;
        return text.substring(0, halfLength) + 
            "\n...[文档中间部分省略]...\n" +
            text.substring(text.length() - halfLength);
    }
    
    private String extractJsonFromResponse(String response) {
        // 处理LLM可能加的markdown代码块
        String cleaned = response.trim();
        if (cleaned.startsWith("```json")) {
            cleaned = cleaned.substring(7);
        } else if (cleaned.startsWith("```")) {
            cleaned = cleaned.substring(3);
        }
        if (cleaned.endsWith("```")) {
            cleaned = cleaned.substring(0, cleaned.length() - 3);
        }
        return cleaned.trim();
    }
    
    private InvoiceInfo parseInvoiceInfo(String json) {
        try {
            return objectMapper.readValue(extractJsonFromResponse(json), InvoiceInfo.class);
        } catch (Exception e) {
            log.error("发票信息解析失败: {}", e.getMessage());
            return new InvoiceInfo();  // 返回空对象，由业务层处理
        }
    }
    
    private ContractInfo parseContractInfo(String json) {
        try {
            return objectMapper.readValue(extractJsonFromResponse(json), ContractInfo.class);
        } catch (Exception e) {
            log.error("合同信息解析失败: {}", e.getMessage());
            return new ContractInfo();
        }
    }
}

第三步：结果验证

/**
 * Business-rule validation of extracted invoice data.
 *
 * <p>Hard failures are reported as errors (the result is invalid); suspicious but possibly
 * legitimate values are reported as warnings.
 */
@Component
public class InvoiceValidator {

    /** Allowed rounding difference, in yuan, between net + tax and the stated gross total. */
    private static final double AMOUNT_TOLERANCE = 0.02;

    /**
     * Slack for binary floating-point noise, so that a difference of exactly two cents
     * (e.g. 1.13 - 1.11 = 0.020000000000000018) is not reported as exceeding the tolerance.
     */
    private static final double FLOAT_EPSILON = 1e-9;

    /** Accepted shape of a taxpayer identification number: 15 to 20 digits or upper-case letters. */
    private static final java.util.regex.Pattern TAX_ID_PATTERN =
        java.util.regex.Pattern.compile("[0-9A-Z]{15,20}");

    /**
     * Validates completeness and plausibility of an extracted invoice.
     *
     * @param invoice the extracted invoice; {@code null} is reported as an error
     * @return the validation outcome; {@code valid} is true only when there are no errors
     */
    public ValidationResult validate(InvoiceInfo invoice) {
        List<String> errors = new ArrayList<>();
        List<String> warnings = new ArrayList<>();

        if (invoice == null) {
            errors.add("发票信息为空");
            return new ValidationResult(false, errors, warnings);
        }

        // Required fields.
        if (isEmpty(invoice.getInvoiceNumber())) {
            errors.add("发票号码不能为空");
        }
        if (isEmpty(invoice.getSellerName())) {
            errors.add("销售方名称不能为空");
        }

        // Amount consistency: net amount + tax should equal the gross total.
        if (invoice.getTotalAmount() != null && invoice.getTaxAmount() != null
                && invoice.getTotalAmountWithTax() != null) {
            double expectedTotal = invoice.getTotalAmount() + invoice.getTaxAmount();
            double diff = Math.abs(expectedTotal - invoice.getTotalAmountWithTax());
            if (diff > AMOUNT_TOLERANCE + FLOAT_EPSILON) {
                warnings.add(String.format(
                    "金额不一致：不含税金额+税额=%.2f，价税合计=%.2f，差额%.2f",
                    expectedTotal, invoice.getTotalAmountWithTax(), diff));
            }
        }

        // Seller taxpayer ID format.
        if (!isEmpty(invoice.getSellerTaxId())
                && !TAX_ID_PATTERN.matcher(invoice.getSellerTaxId()).matches()) {
            warnings.add("销售方纳税人识别号格式可能有误: " + invoice.getSellerTaxId());
        }

        // Date plausibility.
        if (invoice.getInvoiceDate() != null) {
            checkInvoiceDate(invoice.getInvoiceDate(), errors, warnings);
        }

        return new ValidationResult(errors.isEmpty(), errors, warnings);
    }

    /**
     * Checks that the invoice date is a valid ISO date (yyyy-MM-dd), not in the future and not
     * older than five years. A blank value is treated like a missing one.
     *
     * <p>The date text comes from an LLM, so a malformed value is reported as a validation
     * error instead of letting the parse exception abort the whole validation.
     */
    private void checkInvoiceDate(CharSequence rawDate, List<String> errors, List<String> warnings) {
        String dateText = rawDate.toString().trim();
        if (dateText.isEmpty()) {
            return;
        }

        LocalDate invoiceDate;
        try {
            invoiceDate = LocalDate.parse(dateText);
        } catch (java.time.format.DateTimeParseException e) {
            errors.add("开票日期格式无效: " + dateText);
            return;
        }

        // Read the clock once so both comparisons use the same "today".
        LocalDate today = LocalDate.now();
        if (invoiceDate.isAfter(today)) {
            errors.add("开票日期不能是未来日期");
        }
        if (invoiceDate.isBefore(today.minusYears(5))) {
            warnings.add("开票日期超过5年，请确认");
        }
    }

    /** Returns true for {@code null} and for strings that are empty after trimming. */
    private boolean isEmpty(String s) {
        return s == null || s.trim().isEmpty();
    }

    /**
     * Outcome of a validation run.
     *
     * @param valid    true when {@code errors} is empty
     * @param errors   rule violations that make the invoice unusable
     * @param warnings findings that should be reviewed but do not invalidate the invoice
     */
    public record ValidationResult(
        boolean valid,
        List<String> errors,
        List<String> warnings
    ) {}
}

完整处理流水线

/**
 * End-to-end document processing pipeline: preprocess, extract, validate and summarise.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class DocumentIntelligencePipeline {

    private final DocumentPreprocessor preprocessor;
    private final StructuredExtractorService extractor;
    private final InvoiceValidator validator;

    /**
     * Processes one invoice file.
     *
     * <p>This method does not throw: any exception raised by a stage is logged and converted
     * into an error result via {@link InvoiceProcessResult#error}.
     *
     * @param fileContent raw bytes of the uploaded file
     * @param fileName    original file name
     * @return the extracted invoice with its validation outcome, or an error result whose
     *         {@code errorMessage} is always non-null
     */
    public InvoiceProcessResult processInvoice(byte[] fileContent, String fileName) {
        try {
            // 1. Preprocess: obtain plain text from the file.
            log.info("开始处理发票: {}", fileName);
            DocumentPreprocessor.ProcessedDocument doc =
                preprocessor.process(fileContent, fileName);

            // 2. Extract structured fields with the LLM.
            InvoiceInfo invoice = extractor.extractInvoice(doc.getExtractedText());

            // 3. Validate against business rules.
            InvoiceValidator.ValidationResult validation = validator.validate(invoice);

            // 4. Assemble the result.
            InvoiceProcessResult result = InvoiceProcessResult.builder()
                .fileName(fileName)
                .invoice(invoice)
                .validation(validation)
                .ocrSource(doc.getSource())
                .processedAt(LocalDateTime.now())
                .build();

            if (validation.valid()) {
                log.info("发票{}处理成功: 金额={}元", fileName, invoice.getTotalAmountWithTax());
            } else {
                log.warn("发票{}验证失败: {}", fileName, validation.errors());
            }

            return result;

        } catch (Exception e) {
            // Boundary catch: one bad file must not propagate an exception to the caller.
            log.error("发票{}处理异常: {}", fileName, e.getMessage(), e);
            return InvoiceProcessResult.error(fileName, describeFailure(e));
        }
    }

    /**
     * Returns the exception message, or the exception class name when the message is missing,
     * so that an error result never carries a null or blank {@code errorMessage}.
     */
    private static String describeFailure(Exception e) {
        String message = e.getMessage();
        return (message == null || message.isBlank()) ? e.getClass().getName() : message;
    }

    /**
     * Outcome of processing one invoice file. For failed runs only {@code fileName},
     * {@code errorMessage} and {@code processedAt} are set.
     */
    @Data @Builder
    public static class InvoiceProcessResult {
        private String fileName;
        private InvoiceInfo invoice;
        private InvoiceValidator.ValidationResult validation;
        private DocumentPreprocessor.DocumentSource ocrSource;
        private LocalDateTime processedAt;
        private String errorMessage;

        /** Creates a result describing a failed run. */
        public static InvoiceProcessResult error(String fileName, String error) {
            return InvoiceProcessResult.builder()
                .fileName(fileName)
                .errorMessage(error)
                .processedAt(LocalDateTime.now())
                .build();
        }
    }
}

OCR+LLM的组合让文档信息提取的准确率比单独OCR高很多，主要原因是LLM能理解上下文——即使OCR识别了几个错字，LLM也能根据上下文纠正。