Python 多模态 AI 实战——图像识别、语音转文字、视觉问答完整方案
适读人群:想在 Python 项目中集成图像和语音 AI 能力的工程师 | 阅读时长:约18分钟 | 核心价值:掌握多模态 AI 的工程实践,把图像识别、语音识别、视觉问答集成到实际项目中
去年,有个做质检系统的工程师小梁找到我。他们公司做工业生产质检,以前是人工盯着流水线看,现在老板要求用 AI 替代。他们有摄像头、有图片数据、也有语音指令系统,但不知道怎么把这些 AI 能力接入到自己的 Python 系统里。
他以为这很难——图像识别要训练模型,语音识别要买专业服务……
我说:现在不一样了。GPT-4o 直接支持图像输入,Whisper 做语音转文字准确率极高,这些都通过简单的 API 就能调用。你不需要自己训练模型,三天之内就能把多模态能力集成到你的质检系统里。
今天我们把三类多模态能力都讲一遍:图像识别(Vision)、语音转文字(Speech-to-Text)、以及视觉问答(Visual Q&A)。
一、图像识别与视觉理解
1.1 GPT-4o 视觉 API
# Standard-library and OpenAI SDK setup for the vision examples.
import os
import base64
from openai import OpenAI
from pathlib import Path

# Reads the API key from the OPENAI_API_KEY environment variable.
client = OpenAI()
def encode_image_to_base64(image_path: str) -> str:
    """Read a local image file and return its bytes as a base64 string."""
    raw_bytes = Path(image_path).read_bytes()
    return base64.b64encode(raw_bytes).decode("utf-8")
def analyze_image(
    image_source: str,  # local file path or http(s) URL
    question: str = "请详细描述这张图片的内容",
    model: str = "gpt-4o",
    max_tokens: int = 1000,
    detail: str = "high"  # "low", "high", "auto"
) -> str:
    """Analyze an image with the GPT-4o vision API.

    Accepts either a local file path or an http(s) URL.

    Args:
        image_source: Local path or URL of the image.
        question: Prompt sent alongside the image.
        model: Chat model name.
        max_tokens: Completion token cap.
        detail: Vision detail level. "high" is finer-grained but consumes
            more tokens; "low" is fast and cheap for coarse recognition.
            New keyword with a default equal to the old hard-coded value,
            so existing callers are unchanged — and cost_aware_analyze
            (which passes detail=...) now works instead of raising TypeError.

    Returns:
        The model's text answer.
    """
    if image_source.startswith("http"):
        # URL input: pass through directly.
        image_content = {
            "type": "image_url",
            "image_url": {
                "url": image_source,
                "detail": detail
            }
        }
    else:
        # Local file: embed as a base64 data URL with the right MIME type.
        image_data = encode_image_to_base64(image_source)
        suffix = Path(image_source).suffix.lower()
        mime_type = {
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".gif": "image/gif",
            ".webp": "image/webp"
        }.get(suffix, "image/jpeg")  # fall back to JPEG for unknown suffixes
        image_content = {
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{image_data}",
                "detail": detail
            }
        }
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    image_content,
                    {"type": "text", "text": question}
                ]
            }
        ],
        max_tokens=max_tokens
    )
    return response.choices[0].message.content
# Usage example: inspect an industrial part photo for defects.
description = analyze_image(
    "product_defect.jpg",
    question="这个工业零件是否有缺陷?如果有,请描述缺陷的位置和类型。"
)
print(description)

1.2 工业质检应用
import json
from typing import List, Dict
from pydantic import BaseModel
class QualityCheckResult(BaseModel):
    """Structured result of a single product quality inspection."""

    has_defect: bool             # True if any defect was detected
    defect_types: List[str]      # detected defect categories (free text)
    severity: str                # "none", "minor", "major", "critical"
    defect_locations: List[str]  # human-readable positions, e.g. "左上角"
    confidence: float            # model-reported confidence, 0.0-1.0
    action: str                  # "pass", "rework", "reject"
    notes: str                   # free-form remarks from the model
def industrial_quality_check(image_path: str) -> QualityCheckResult:
    """Industrial QC: analyze a product photo and return a structured result.

    Sends the image plus a Chinese inspection prompt to GPT-4o with
    temperature=0 and JSON-object output, then validates the reply into
    a QualityCheckResult.

    Raises:
        json.JSONDecodeError or pydantic validation errors if the model
        reply does not match the expected schema.
    """
    # Prompt kept in Chinese (deployment language); it enumerates the defect
    # classes and pins the exact JSON schema the model must emit.
    QUALITY_CHECK_PROMPT = """你是一个专业的工业质检专家,擅长识别产品缺陷。
请分析这张产品图片,检查以下缺陷类型:
- 表面划痕或刮花
- 变形或尺寸不符
- 颜色异常(色差、变色)
- 组装错误(零件缺失、位置偏移)
- 污染(油污、灰尘、锈迹)
请按照以下 JSON 格式输出结果:
{
"has_defect": true/false,
"defect_types": ["缺陷类型1", ...],
"severity": "none/minor/major/critical",
"defect_locations": ["左上角", ...],
"confidence": 0.0-1.0,
"action": "pass/rework/reject",
"notes": "补充说明"
}
只输出 JSON,不要其他文字。"""
    # Image is always sent as a JPEG data URL; matches the JPEG fallback
    # used elsewhere in this file.
    image_data = encode_image_to_base64(image_path)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image_data}",
                            "detail": "high"  # fine-grained defects need high detail
                        }
                    },
                    {"type": "text", "text": QUALITY_CHECK_PROMPT}
                ]
            }
        ],
        max_tokens=500,
        temperature=0,  # deterministic output for QC decisions
        response_format={"type": "json_object"}  # forces a parseable JSON reply
    )
    result_dict = json.loads(response.choices[0].message.content)
    return QualityCheckResult(**result_dict)
# Batch quality check
def batch_quality_check(image_paths: List[str]) -> Dict:
    """Run industrial_quality_check over many images and summarize.

    Args:
        image_paths: Paths of the product photos to inspect.

    Returns:
        Dict with totals, pass/fail counts, a pass-rate string and
        per-image details. Per-image failures are recorded as
        status="error" entries instead of aborting the whole batch.
    """
    results = []
    pass_count = 0
    for path in image_paths:
        try:
            result = industrial_quality_check(path)
            results.append({
                "image": path,
                "result": result.dict(),
                "status": "success"
            })
            if result.action == "pass":
                pass_count += 1
        except Exception as e:
            # Best-effort batch: keep going, record the error per image.
            results.append({"image": path, "status": "error", "error": str(e)})
    total = len(image_paths)
    # Guard: the original divided by len(image_paths) unconditionally and
    # raised ZeroDivisionError for an empty batch.
    rate = (pass_count / total * 100) if total else 0.0
    return {
        "total": total,
        "pass": pass_count,
        "fail": total - pass_count,
        "pass_rate": f"{rate:.1f}%",
        "details": results
    }

# 二、语音转文字(Whisper API)
2.1 基础语音识别
# Setup for the Whisper (speech-to-text) section.
import os
from openai import OpenAI
from pathlib import Path
import tempfile

# Re-creates the client; redundant if the vision section already ran in the
# same process, but harmless.
client = OpenAI()
def transcribe_audio(
    audio_path: str,
    language: str = "zh",  # "zh" Chinese, "en" English, None = auto-detect
    response_format: str = "json"  # "text", "json", "srt", "vtt", "verbose_json"
) -> dict:
    """Transcribe an audio file with the Whisper API.

    Supported containers: mp3, mp4, mpeg, mpga, m4a, wav, webm.
    API file-size limit: 25 MB.

    Args:
        audio_path: Path to the audio file.
        language: ISO language hint; a falsy value lets Whisper auto-detect.
        response_format: Whisper output format.

    Returns:
        Dict always containing "text"; "verbose_json" additionally yields
        "language", "duration" and per-segment timestamps.
    """
    with open(audio_path, "rb") as audio_file:
        kwargs = {
            "model": "whisper-1",
            "file": audio_file,
            "response_format": response_format
        }
        if language:  # only send the hint when one was given
            kwargs["language"] = language
        transcript = client.audio.transcriptions.create(**kwargs)
    # Fix: "text", "srt" and "vtt" formats come back as plain strings with
    # no .text attribute — the original only special-cased "text", so
    # "srt"/"vtt" crashed on transcript.text. Normalize all string replies.
    if isinstance(transcript, str):
        return {"text": transcript}
    if response_format == "verbose_json":
        return {
            "text": transcript.text,
            "language": transcript.language,
            "duration": transcript.duration,
            # NOTE(review): depending on the openai SDK version, segments may
            # be objects rather than dicts — confirm seg["start"] access
            # against the installed SDK.
            "segments": [
                {
                    "start": seg["start"],
                    "end": seg["end"],
                    "text": seg["text"]
                }
                for seg in transcript.segments
            ]
        }
    return {"text": transcript.text}
# 实测性能参数:
# whisper-1 模型
# 1分钟音频 ≈ 4秒处理时间(服务端)
# 成本:$0.006/分钟
# 准确率:中文约 95%+,带口音约 88-92%
def transcribe_with_speaker_detection(audio_path: str) -> list:
    """Transcribe audio, then label speakers per segment via an LLM pass.

    Whisper itself does no speaker diarization, so this post-processes the
    timestamped transcript with gpt-4o-mini.

    Returns:
        List of {"start", "end", "speaker", "text"} dicts; empty list if
        the model returns no segments.
    """
    # Step 1: timestamped transcript.
    result = transcribe_audio(audio_path, response_format="verbose_json")
    segments_text = "\n".join([
        f"[{seg['start']:.1f}s-{seg['end']:.1f}s]: {seg['text']}"
        for seg in result.get("segments", [])
    ])
    # Step 2: LLM speaker attribution.
    # Fix: response_format={"type": "json_object"} requires a top-level JSON
    # *object*; the original prompt demanded a bare array, which conflicts
    # with json_object mode. Ask for {"segments": [...]} and unwrap it.
    speaker_prompt = f"""以下是一段对话的转录(带时间戳)。
请分析说话人,用 JSON 格式返回每段话的说话人:
{segments_text}
输出格式:
{{"segments": [
{{"start": 0.0, "end": 3.2, "speaker": "A", "text": "..."}},
...
]}}"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": speaker_prompt}],
        temperature=0,
        response_format={"type": "json_object"}
    )
    return json.loads(response.choices[0].message.content).get("segments", [])

# 2.2 实时语音处理(麦克风输入)
import pyaudio # pip install pyaudio
import wave
import io
import threading
def record_and_transcribe(duration: int = 5) -> str:
    """Record from the default microphone and transcribe with Whisper.

    Args:
        duration: Recording length in seconds.

    Returns:
        The transcribed text (language hint: Chinese).
    """
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1   # mono is enough for speech
    RATE = 16000   # 16 kHz keeps the payload small
    print(f"开始录音 {duration} 秒...")
    p = pyaudio.PyAudio()
    stream = p.open(
        format=FORMAT, channels=CHANNELS,
        rate=RATE, input=True,
        frames_per_buffer=CHUNK
    )
    frames = []
    try:
        for _ in range(0, int(RATE / CHUNK * duration)):
            frames.append(stream.read(CHUNK))
    finally:
        # Fix: always release the audio device — the original leaked the
        # stream and the PyAudio instance if stream.read raised mid-loop.
        stream.stop_stream()
        stream.close()
        p.terminate()
    print("录音完成,正在转录...")
    # Package the raw PCM frames as an in-memory WAV (Whisper needs a
    # recognized container format).
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b"".join(frames))
    buffer.seek(0)
    buffer.name = "recording.wav"  # SDK infers the format from the file name
    transcript = client.audio.transcriptions.create(
        model="whisper-1",
        file=buffer,
        language="zh"
    )
    return transcript.text

# 三、多图片视觉问答
def visual_qa_multi_image(
    images: List[str],
    question: str
) -> str:
    """Ask one question about several images at once.

    GPT-4o accepts up to 20 images per request. Each image is preceded by a
    "图片 N:" label so the model can reference them, and "low" detail is
    used throughout to keep multi-image token cost down.
    """
    content = []
    for idx, source in enumerate(images, start=1):
        # Label first, then the image itself.
        content.append({"type": "text", "text": f"图片 {idx}:"})
        if source.startswith("http"):
            payload = {"url": source, "detail": "low"}
        else:
            encoded = encode_image_to_base64(source)
            payload = {
                "url": f"data:image/jpeg;base64,{encoded}",
                "detail": "low"
            }
        content.append({"type": "image_url", "image_url": payload})
    # The question goes last, after all labeled images.
    content.append({"type": "text", "text": question})
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
        max_tokens=2000
    )
    return response.choices[0].message.content
# Usage: compare two versions of a UI screenshot.
result = visual_qa_multi_image(
    ["ui_v1.png", "ui_v2.png"],
    "对比这两个版本的 UI 界面,说明主要变化,并评价新版本是否有改善用户体验"
)
print(result)

四、踩坑实录一:图片 Token 消耗超预期
现象:处理一批 100 张产品图片,Token 消耗比预计多了5倍,费用超支。
原因:GPT-4o 处理图片时,detail: "high" 模式会先缩放图片(最长边不超过 2048px,再把最短边缩至 768px),然后分割成多个 512×512 的块分别处理,每块消耗 170 个 Token,外加 85 个基础 Token。例如一张 2048×4096 的高清图会消耗 1105 个 Token(仅图片部分)。
解法:
# Estimate the image-token cost of a GPT-4o vision request.
def estimate_image_tokens(width: int, height: int, detail: str = "high") -> int:
    """Estimate how many tokens an image consumes in GPT-4o vision.

    Implements the documented accounting: "low" is a flat 85 tokens; "high"
    first scales the image to fit within 2048x2048, then scales the
    *shortest* side down to 768 px (the step the original omitted, which
    made it over-estimate large images), then charges 170 tokens per
    512x512 tile plus a base 85.

    Args:
        width: Original pixel width.
        height: Original pixel height.
        detail: "low" or "high".

    Returns:
        Estimated token count for the image part of the request.
        Documented reference points: 1024x1024 -> 765, 2048x4096 -> 1105.
    """
    if detail == "low":
        return 85  # flat cost, independent of image size
    # Step 1: fit within a 2048 x 2048 square.
    longest = max(width, height)
    if longest > 2048:
        scale = 2048 / longest
        width, height = int(width * scale), int(height * scale)
    # Step 2: scale so the shortest side is at most 768 px.
    shortest = min(width, height)
    if shortest > 768:
        scale = 768 / shortest
        width, height = int(width * scale), int(height * scale)
    # Step 3: 170 tokens per 512 px tile (ceiling division) + 85 base.
    tiles_w = (width + 511) // 512
    tiles_h = (height + 511) // 512
    return 170 * (tiles_w * tiles_h) + 85
# Use low-detail mode for batch processing to keep token costs down.
def cost_aware_analyze(image_path: str, budget_tokens: int = 200) -> str:
    """Choose the vision detail level automatically from a token budget."""
    from PIL import Image
    w, h = Image.open(image_path).size
    # Fall back to "low" whenever a high-detail pass would blow the budget.
    high_cost = estimate_image_tokens(w, h, "high")
    if high_cost <= budget_tokens:
        detail = "high"
    else:
        detail = "low"
    print(f"图片 {w}×{h},使用 {detail} 模式(约 {estimate_image_tokens(w, h, detail)} tokens)")
    return analyze_image(image_path, detail=detail)

# 五、踩坑实录二:Whisper 转录准确率低
现象:语音转录结果出现大量错误,特别是专有名词、技术术语被转录成谐音字。
原因:Whisper 对通用词汇准确率高,但对行业专有名词(如产品型号、技术术语)准确率较低。
解法:
def transcribe_with_vocabulary(
    audio_path: str,
    vocabulary: List[str]  # domain-specific terms to bias recognition toward
) -> str:
    """Transcribe audio with a custom-vocabulary hint.

    Whisper's `prompt` parameter is used to nudge recognition toward the
    supplied proper nouns and jargon, which plain transcription tends to
    render as homophones.
    """
    joined_terms = "、".join(vocabulary)
    vocab_hint = "以下是对话中可能出现的专有名词:" + joined_terms
    with open(audio_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language="zh",
            prompt=vocab_hint  # Whisper biases toward terms mentioned here
        )
    return transcript.text
# Usage: meeting recording with AI/ML jargon.
result = transcribe_with_vocabulary(
    "meeting_record.mp3",
    vocabulary=["LangChain", "RAG", "向量数据库", "Transformer", "微调"]
)

六、踩坑实录三:本地视觉模型准确率差
现象:为节省成本用本地开源视觉模型(如 LLaVA)做质检,但准确率远低于 GPT-4o,漏检率高。
原因:开源视觉模型在特定工业场景下的准确率确实不如 GPT-4o,特别是细节识别(划痕、色差等)。
解法:
# Tiered inspection: a cheap local model screens first; only suspicious
# images are escalated to GPT-4o for a detailed check.
def tiered_quality_check(image_path: str) -> dict:
    """Two-stage QC: local LLaVA coarse screen + GPT-4o fine inspection.

    Cuts GPT-4o call volume — only the images the local model flags
    (the article estimates 10-20%) are escalated.

    Returns:
        Full structured result (industrial_quality_check) for flagged
        images; otherwise a synthetic "pass" record.
    """
    # Stage 1: local Ollama + LLaVA quick screen.
    import httpx
    response = httpx.post(
        "http://localhost:11434/api/generate",
        json={
            "model": "llava:7b",
            "prompt": "这张产品图片有明显缺陷吗?只回答 yes 或 no。",
            "images": [encode_image_to_base64(image_path)],
            "stream": False
        },
        # Fix: httpx defaults to a 5 s timeout, which local vision
        # inference routinely exceeds; give the model time to answer.
        timeout=60.0
    )
    has_obvious_defect = "yes" in response.json()["response"].lower()
    if has_obvious_defect:
        # Stage 2: precise GPT-4o analysis, only when stage 1 flags it.
        print("本地模型发现疑似缺陷,使用 GPT-4o 精检...")
        return industrial_quality_check(image_path).dict()
    # NOTE(review): 0.85 is a hard-coded trust level for the local model's
    # "no defect" verdict — confirm against its measured false-negative rate.
    return {"has_defect": False, "action": "pass", "confidence": 0.85}

# 多模态 AI 的能力现在已经非常强大,并且使用门槛极低。图像识别用 GPT-4o Vision,语音识别用 Whisper,这两个工具覆盖了 90% 的多模态应用场景。关键是找到合适的业务场景,设计合理的 Prompt,处理好成本和精度的权衡。
