多模态AI应用开发：图文理解与生成Spring AI完整实现

老张2026/4/30大约 9 分钟

多模态AI应用开发：图文理解与生成Spring AI完整实现

适读人群：有1-5年Java开发经验，想向AI工程师方向转型的开发者 阅读时长：约18分钟 文章价值：
掌握Spring AI处理图片、文档等多模态输入的完整方案
学会实现图文理解、图表分析、文档解析等常见多模态场景
了解多模态AI应用的架构设计和工程实践

一张截图引发的需求

老方是一家SaaS公司的产品经理，他们的产品是面向中小企业的财务助手。

用户一直在反馈一个需求："能不能直接发给你一张报销单的照片，你帮我识别出金额和类别，自动录入？"

老方把这个需求丢给了技术团队。研发同学小周说："用OCR可以，但只能提取文字，理解上下文还是要人工。"

然后这个需求就一直搁置着。

直到小周发现GPT-4o和Claude 3.5已经可以直接看图、理解图片内容了，他用了两天时间，直接实现了：用户拍张发票照片，AI自动识别金额、日期、类别，直接出现在录入表单里，准确率超过95%。

这就是多模态AI的魅力——让AI直接"看"图片，而不是先OCR再NLP再理解，中间少了太多损耗。

今天我来讲Spring AI的多模态开发，重点是图文理解这个最实用的场景。

多模态AI的能力全景

Spring AI多模态架构

Maven依赖配置

<dependencies>
    <!-- Spring AI OpenAI starter (GPT-4o multimodal support) -->
    <!-- NOTE(review): Spring AI renamed its Boot starters during the 1.0.0 milestone cycle
         (the OpenAI starter became spring-ai-starter-model-openai); confirm that this
         artifactId/version pair actually resolves and matches the API used by the Java code. -->
    <dependency>
        <groupId>org.springframework.ai</groupId>
        <artifactId>spring-ai-openai-spring-boot-starter</artifactId>
        <version>1.0.0</version>
    </dependency>
    
    <!-- Web starter: provides multipart file upload handling -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    
    <!-- Image processing (resizing / format conversion) -->
    <dependency>
        <groupId>net.coobird</groupId>
        <artifactId>thumbnailator</artifactId>
        <version>0.4.20</version>
    </dependency>
    
    <!-- PDF processing -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox</artifactId>
        <version>3.0.2</version>
    </dependency>
</dependencies>

核心实现一：图片理解服务

package com.laozhang.ai.multimodal;

import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.chat.client.ChatClient;
import org.springframework.ai.chat.messages.UserMessage;
import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.ai.chat.prompt.Prompt;
import org.springframework.ai.model.Media;
import org.springframework.stereotype.Service;
import org.springframework.util.MimeTypeUtils;
import org.springframework.web.multipart.MultipartFile;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;

/**
 * Image understanding service.
 *
 * <p>Sends an uploaded image together with a text prompt to the chat model and returns
 * the model's answer. Covers invoice recognition, chart data extraction, UI screenshot
 * analysis and free-form image Q&amp;A. Every image is re-encoded as JPEG (and downscaled
 * when needed) before it is sent.
 */
@Service
@Slf4j
@RequiredArgsConstructor
public class ImageUnderstandingService {

    private final ChatClient chatClient;

    // Size budget for a preprocessed image: 4MB. The original note states GPT-4o accepts
    // up to 20MB; staying well below that keeps token usage down.
    private static final int MAX_IMAGE_SIZE_BYTES = 4 * 1024 * 1024;
    // Maximum length of either side in pixels; larger images are scaled down.
    private static final int MAX_DIMENSION = 2048;

    // Shared mapper: building an ObjectMapper per call is wasteful. Unknown properties are
    // ignored so that an extra key in the model's JSON does not discard the whole result.
    private static final com.fasterxml.jackson.databind.ObjectMapper JSON_MAPPER =
        new com.fasterxml.jackson.databind.ObjectMapper()
            .configure(
                com.fasterxml.jackson.databind.DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,
                false);

    /**
     * Answers a free-form question about an uploaded image.
     *
     * @param imageFile the uploaded image
     * @param question  the user's question about the image
     * @return the model's textual answer
     * @throws IOException if the image cannot be read or decoded
     */
    public String askAboutImage(MultipartFile imageFile, String question) throws IOException {
        UserMessage userMessage = new UserMessage(question, List.of(toJpegMedia(imageFile)));

        // Use content() like the other methods instead of unwrapping ChatResponse by hand.
        return chatClient.prompt()
            .messages(userMessage)
            .call()
            .content();
    }

    /**
     * Recognizes an invoice/receipt image using a prompt tailored to expense reporting.
     *
     * @param imageFile the uploaded invoice image
     * @return the extracted fields; every field is null when the model output cannot be parsed
     * @throws IOException if the image cannot be read or decoded
     */
    public InvoiceInfo recognizeInvoice(MultipartFile imageFile) throws IOException {
        String systemPrompt = """
            你是一个专业的财务票据识别助手。
            请从图片中提取票据信息，严格按照JSON格式返回，不要有任何额外说明。
            如果某个字段无法识别，返回null。
            """;
        
        String userPrompt = """
            请识别这张票据，返回如下JSON格式：
            {
                "invoiceType": "发票类型（增值税发票/普通发票/收据等）",
                "invoiceNumber": "发票号码",
                "date": "开票日期（格式YYYY-MM-DD）",
                "amount": 金额数字（不含单位）,
                "taxAmount": 税额数字,
                "totalAmount": 价税合计数字,
                "seller": "销售方名称",
                "buyer": "购买方名称",
                "itemCategory": "商品/服务类别",
                "itemDescription": "商品/服务描述"
            }
            """;

        UserMessage userMessage = new UserMessage(userPrompt, List.of(toJpegMedia(imageFile)));

        String jsonResult = chatClient.prompt()
            .system(systemPrompt)
            .messages(userMessage)
            .call()
            .content();

        return parseInvoiceJson(jsonResult);
    }

    /**
     * Extracts structured data from a chart image (bar, line, pie, ...).
     *
     * @param chartImage the uploaded chart image
     * @return the extracted chart data; every field is null when the model output cannot be parsed
     * @throws IOException if the image cannot be read or decoded
     */
    public ChartData extractChartData(MultipartFile chartImage) throws IOException {
        String prompt = """
            请分析这张图表并提取数据：
            1. 图表类型（柱状图/折线图/饼图/散点图等）
            2. 标题
            3. X轴标签（如果有）
            4. Y轴标签和单位（如果有）
            5. 所有数据点/数据系列
            6. 关键趋势或洞察
            
            请按以下JSON格式返回：
            {
                "chartType": "图表类型",
                "title": "图表标题",
                "xAxisLabel": "X轴标签",
                "yAxisLabel": "Y轴标签（含单位）",
                "dataSeries": [
                    {
                        "name": "系列名称",
                        "data": [{"label": "标签", "value": 数值}]
                    }
                ],
                "insights": ["关键洞察1", "关键洞察2"]
            }
            """;

        UserMessage message = new UserMessage(prompt, List.of(toJpegMedia(chartImage)));
        String result = chatClient.prompt()
            .system("你是一个专业的数据分析师，擅长从图表中准确提取数据。")
            .messages(message)
            .call()
            .content();

        return parseChartJson(result);
    }

    /**
     * Analyzes a UI screenshot.
     *
     * @param screenshot   the uploaded screenshot
     * @param analysisType one of {@code "describe"}, {@code "bug-report"}, {@code "test-cases"};
     *                     any other value (including null) falls back to a generic description
     * @return the analysis type as passed in, paired with the model's answer
     * @throws IOException if the image cannot be read or decoded
     */
    public UIAnalysisResult analyzeUIScreenshot(MultipartFile screenshot, 
                                                  String analysisType) throws IOException {
        // A switch on a null String throws NullPointerException; route null to the default.
        String prompt = switch (analysisType == null ? "" : analysisType) {
            case "describe" -> "请详细描述这个界面的布局、功能区域和主要交互元素。";
            case "bug-report" -> """
                请分析这个界面截图，找出可能的UI/UX问题：
                1. 布局问题
                2. 文字截断或溢出
                3. 对齐问题
                4. 可访问性问题
                5. 其他视觉缺陷
                """;
            case "test-cases" -> """
                基于这个界面截图，生成功能测试用例（Markdown格式）：
                1. 正向流程测试
                2. 边界条件测试
                3. 异常场景测试
                """;
            default -> "请描述这张图片的内容。";
        };

        UserMessage message = new UserMessage(prompt, List.of(toJpegMedia(screenshot)));
        String result = chatClient.prompt()
            .messages(message)
            .call()
            .content();

        return new UIAnalysisResult(analysisType, result);
    }

    /**
     * Preprocesses an uploaded image: downscales it so neither side exceeds
     * {@code MAX_DIMENSION} and re-encodes it as JPEG at 85% quality.
     *
     * <p>Public because {@code MultiImageComparisonService} reuses it.
     *
     * @param file the uploaded image
     * @return the JPEG-encoded bytes
     * @throws IllegalArgumentException if {@code file} is null or empty
     * @throws IOException if the upload cannot be read or is not a decodable image
     */
    public byte[] preprocessImage(MultipartFile file) throws IOException {
        if (file == null || file.isEmpty()) {
            throw new IllegalArgumentException("上传的图片文件为空");
        }

        // ImageIO.read does not close the stream it is given, so close it here.
        BufferedImage original;
        try (var input = file.getInputStream()) {
            original = ImageIO.read(input);
        }
        // ImageIO.read returns null (rather than throwing) when no reader can decode the data.
        if (original == null) {
            throw new IOException("无法解码图片（格式不支持或文件已损坏）: " + file.getOriginalFilename());
        }

        int width = original.getWidth();
        int height = original.getHeight();

        // Scale proportionally when either side exceeds the limit.
        if (width > MAX_DIMENSION || height > MAX_DIMENSION) {
            double scale = Math.min(
                (double) MAX_DIMENSION / width,
                (double) MAX_DIMENSION / height
            );
            // Clamp to 1px: truncation can yield 0 for extreme aspect ratios.
            width = Math.max(1, (int) (width * scale));
            height = Math.max(1, (int) (height * scale));

            log.debug("图片压缩: {}x{} -> {}x{}", 
                original.getWidth(), original.getHeight(), width, height);
        }

        // Re-encode as JPEG to shrink the payload and normalize the format.
        ByteArrayOutputStream output = new ByteArrayOutputStream();
        net.coobird.thumbnailator.Thumbnails.of(original)
            .size(width, height)
            .outputFormat("JPEG")
            .outputQuality(0.85)  // 85% quality: trade-off between sharpness and size
            .toOutputStream(output);

        byte[] compressed = output.toByteArray();
        log.debug("图片大小: {}KB -> {}KB", 
            file.getSize() / 1024, compressed.length / 1024);

        if (compressed.length > MAX_IMAGE_SIZE_BYTES) {
            log.warn("压缩后图片仍超过大小预算: {}KB > {}KB",
                compressed.length / 1024, MAX_IMAGE_SIZE_BYTES / 1024);
        }

        return compressed;
    }

    /** Preprocesses the upload and wraps the resulting JPEG bytes as a {@link Media} attachment. */
    private Media toJpegMedia(MultipartFile file) throws IOException {
        return new Media(MimeTypeUtils.IMAGE_JPEG, preprocessImage(file));
    }

    /**
     * Returns the substring from the first '{' to the last '}' so that text the model adds
     * around the JSON object is dropped. Returns the input unchanged when no such span exists
     * or the input is null.
     */
    private static String extractJsonObject(String text) {
        if (text == null) {
            return null;
        }
        int start = text.indexOf('{');
        int end = text.lastIndexOf('}') + 1;
        return (start >= 0 && end > start) ? text.substring(start, end) : text;
    }

    /**
     * Parses the model output into an {@link InvoiceInfo}.
     * Best effort: on any failure the error is logged and an all-null record is returned.
     */
    private InvoiceInfo parseInvoiceJson(String json) {
        try {
            return JSON_MAPPER.readValue(extractJsonObject(json), InvoiceInfo.class);
        } catch (Exception e) {
            // Broad catch is deliberate: model output is untrusted free text.
            log.error("解析发票JSON失败: {}", json, e);
            return new InvoiceInfo(null, null, null, null, null, null, null, null, null, null);
        }
    }

    /**
     * Parses the model output into a {@link ChartData}, mirroring {@code parseInvoiceJson}.
     * Best effort: on any failure the error is logged and an all-null record is returned.
     */
    private ChartData parseChartJson(String json) {
        try {
            return JSON_MAPPER.readValue(extractJsonObject(json), ChartData.class);
        } catch (Exception e) {
            // Broad catch is deliberate: model output is untrusted free text.
            log.error("解析图表JSON失败: {}", json, e);
            return new ChartData(null, null, null, null, null, null);
        }
    }

    // Data models

    /**
     * Fields extracted from an invoice; any field may be null when it could not be recognized.
     * NOTE(review): amounts are Double; BigDecimal would avoid rounding issues for money but
     * would change this public record's signature.
     */
    public record InvoiceInfo(
        String invoiceType,
        String invoiceNumber,
        String date,
        Double amount,
        Double taxAmount,
        Double totalAmount,
        String seller,
        String buyer,
        String itemCategory,
        String itemDescription
    ) {}

    /** Data extracted from a chart; {@code dataSeries} holds whatever structure the JSON contained. */
    public record ChartData(
        String chartType,
        String title,
        String xAxisLabel,
        String yAxisLabel,
        Object dataSeries,
        List<String> insights
    ) {}

    /** The requested analysis type paired with the model's answer. */
    public record UIAnalysisResult(String analysisType, String result) {}
}

核心实现二：多图对比分析

/**
 * Multi-image comparison service.
 *
 * <p>Attaches several images to a single user message so the model can compare them
 * or reason about them as an ordered sequence.
 *
 * <p>NOTE(review): this class calls {@code ImageUnderstandingService.preprocessImage};
 * that method must be non-private for this class to compile.
 */
@Service
@RequiredArgsConstructor
public class MultiImageComparisonService {

    private final ChatClient chatClient;
    private final ImageUnderstandingService imageProcessor;

    /**
     * Compares two images and describes their differences.
     *
     * @param image1         the first image
     * @param image2         the second image
     * @param comparisonType {@code "ui-diff"} or {@code "product"}; any other value
     *                       (including null) falls back to a generic comparison prompt
     * @return the model's textual comparison
     * @throws IOException if either image cannot be read or decoded
     */
    public String compareImages(MultipartFile image1, MultipartFile image2,
                                 String comparisonType) throws IOException {
        Media first = toJpegMedia(image1);
        Media second = toJpegMedia(image2);

        // A switch on a null String throws NullPointerException; route null to the default.
        String prompt = switch (comparisonType == null ? "" : comparisonType) {
            case "ui-diff" -> """
                请对比这两张UI截图，找出所有差异：
                1. 新增的元素
                2. 删除的元素
                3. 修改的内容（文字、颜色、位置等）
                4. 布局变化
                按差异类别列出。
                """;
            case "product" -> """
                请对比这两款产品的图片，从以下维度分析：
                外观差异、功能差异、设计风格差异。
                """;
            default -> "请详细对比这两张图片的异同。";
        };

        // Both images travel in one message.
        UserMessage message = new UserMessage(prompt, List.of(first, second));

        return chatClient.prompt()
            .messages(message)
            .call()
            .content();
    }

    /**
     * Analyzes an ordered series of screenshots as one usage flow.
     *
     * @param images  the screenshots, in the order they should be interpreted
     * @param context free-text background information inserted into the prompt;
     *                null is treated as empty
     * @return the model's textual analysis
     * @throws IllegalArgumentException if {@code images} is null or empty
     * @throws IOException if any image cannot be read or decoded
     */
    public String analyzeImageSequence(List<MultipartFile> images, 
                                        String context) throws IOException {
        if (images == null || images.isEmpty()) {
            throw new IllegalArgumentException("至少需要一张截图");
        }

        List<Media> mediaList = new ArrayList<>(images.size());
        for (MultipartFile image : images) {
            mediaList.add(toJpegMedia(image));
        }

        // Avoid rendering the literal "null" into the prompt.
        String background = (context == null) ? "" : context;
        String prompt = """
            以下是%d张按顺序排列的截图（图1至图%d）。
            背景信息：%s
            
            请分析这一系列截图展示的流程：
            1. 每个步骤的操作
            2. 整体流程逻辑
            3. 潜在的用户体验问题
            """.formatted(images.size(), images.size(), background);

        UserMessage message = new UserMessage(prompt, mediaList);

        return chatClient.prompt()
            .system("你是一个UX分析专家和流程优化顾问。")
            .messages(message)
            .call()
            .content();
    }

    /** Preprocesses the upload via the shared image service and wraps it as a JPEG {@link Media}. */
    private Media toJpegMedia(MultipartFile file) throws IOException {
        return new Media(MimeTypeUtils.IMAGE_JPEG, imageProcessor.preprocessImage(file));
    }
}

API接口实现

/**
 * REST endpoints for the multimodal AI features.
 *
 * <p>Each endpoint accepts multipart uploads and delegates to the image services.
 * Empty uploads are rejected with HTTP 400 instead of being passed on to image decoding.
 */
@RestController
@RequestMapping("/api/multimodal")
@RequiredArgsConstructor
@Slf4j
public class MultimodalController {

    private final ImageUnderstandingService imageService;
    private final MultiImageComparisonService comparisonService;

    /**
     * Invoice recognition.
     *
     * @param file the uploaded invoice image
     * @return 200 with the extracted invoice fields, or 400 when the upload is empty
     * @throws IOException if the image cannot be read or decoded
     */
    @PostMapping("/invoice/recognize")
    public ResponseEntity<InvoiceInfo> recognizeInvoice(
        @RequestParam("file") MultipartFile file
    ) throws IOException {
        if (isMissing(file)) {
            return ResponseEntity.badRequest().build();
        }
        log.info("收到发票识别请求，文件大小: {}KB", file.getSize() / 1024);

        InvoiceInfo result = imageService.recognizeInvoice(file);
        return ResponseEntity.ok(result);
    }

    /**
     * Chart data extraction.
     *
     * @param file the uploaded chart image
     * @return 200 with the extracted chart data, or 400 when the upload is empty
     * @throws IOException if the image cannot be read or decoded
     */
    @PostMapping("/chart/extract")
    public ResponseEntity<ChartData> extractChartData(
        @RequestParam("file") MultipartFile file
    ) throws IOException {
        if (isMissing(file)) {
            return ResponseEntity.badRequest().build();
        }
        ChartData result = imageService.extractChartData(file);
        return ResponseEntity.ok(result);
    }

    /**
     * Free-form image Q&amp;A.
     *
     * @param file     the uploaded image
     * @param question the question to ask about the image
     * @return 200 with {@code {"answer": ...}}, or 400 when the upload is empty
     * @throws IOException if the image cannot be read or decoded
     */
    @PostMapping("/image/ask")
    public ResponseEntity<Map<String, String>> askAboutImage(
        @RequestParam("file") MultipartFile file,
        @RequestParam("question") String question
    ) throws IOException {
        if (isMissing(file)) {
            return ResponseEntity.badRequest().build();
        }
        String answer = imageService.askAboutImage(file, question);
        // Map.of rejects null values; fall back to an empty string if the model returned nothing.
        return ResponseEntity.ok(Map.of("answer", answer == null ? "" : answer));
    }

    /**
     * UI screenshot analysis.
     *
     * @param file the uploaded screenshot
     * @param type the analysis type; defaults to {@code "describe"}
     * @return 200 with the analysis result, or 400 when the upload is empty
     * @throws IOException if the image cannot be read or decoded
     */
    @PostMapping("/ui/analyze")
    public ResponseEntity<UIAnalysisResult> analyzeUI(
        @RequestParam("file") MultipartFile file,
        @RequestParam(value = "type", defaultValue = "describe") String type
    ) throws IOException {
        if (isMissing(file)) {
            return ResponseEntity.badRequest().build();
        }
        UIAnalysisResult result = imageService.analyzeUIScreenshot(file, type);
        return ResponseEntity.ok(result);
    }

    /**
     * Two-image comparison.
     *
     * @param image1 the first image
     * @param image2 the second image
     * @param type   the comparison type; defaults to {@code "general"}
     * @return 200 with {@code {"result": ...}}, or 400 when either upload is empty
     * @throws IOException if either image cannot be read or decoded
     */
    @PostMapping("/compare")
    public ResponseEntity<Map<String, String>> compareImages(
        @RequestParam("image1") MultipartFile image1,
        @RequestParam("image2") MultipartFile image2,
        @RequestParam(value = "type", defaultValue = "general") String type
    ) throws IOException {
        if (isMissing(image1) || isMissing(image2)) {
            return ResponseEntity.badRequest().build();
        }
        String result = comparisonService.compareImages(image1, image2, type);
        // Map.of rejects null values; fall back to an empty string if the model returned nothing.
        return ResponseEntity.ok(Map.of("result", result == null ? "" : result));
    }

    /** Returns true when no usable upload was supplied (null part or zero-length content). */
    private static boolean isMissing(MultipartFile file) {
        return file == null || file.isEmpty();
    }
}

多模态AI的成本注意事项

场景	图片大小	Vision Token消耗	建议
低分辨率文档（768×768以下）	~1MB	~85 tokens	直接发送
高分辨率截图（1920×1080）	~3MB	~1700 tokens	缩放到1024px
大量图表	~5MB	~3000+ tokens	压缩到1024px，质量0.8
批量发票处理	多张	累计较高	考虑本地OCR预处理

关键建议：Vision API的Token按图片分辨率计费（GPT-4o在detail=low模式下每张图固定约85 tokens；detail=high模式下先缩放图片，再按512×512的tile计算，每个tile约170 tokens，另加约85 tokens的基础开销）。图片能压缩就压缩，不要发原图。