Python Pydantic v2 深度实战——数据验证、序列化、模型继承完整指南
2026/4/30大约 7 分钟
Python Pydantic v2 深度实战——数据验证、序列化、模型继承完整指南
适读人群:正在用或准备用 Pydantic v2 的 Python 工程师,特别是做 FastAPI 开发的 | 阅读时长:约 17 分钟 | 核心价值:掌握 Pydantic v2 全部核心特性,写出类型安全、验证严格的 Python 代码
从一次数据脏入说起
小阳是做 AI 平台的,有个接口接收用户提交的配置参数,包括模型名称、温度值、最大 token 数等。上线第一周,就有用户传了个 temperature: "hot"(字符串!),导致下游调用崩溃。
再后来有人传了 max_tokens: -1,负数传进了 LLM API,报了一堆神奇的错误,排查了一个下午。
他用的是手写的 if/else 做参数校验,维护成本极高,而且总有遗漏。我建议他换成 Pydantic v2,把所有校验逻辑集中在模型定义里。
那次迁移之后,参数类型问题直接归零——Pydantic 在数据进入系统之前就把不合规的请求挡回去了,错误信息还非常友好。
一、Pydantic v2 vs v1:核心变化
Pydantic v2 在 2023 年发布,底层用 Rust 重写,性能提升 5-50 倍。API 有不少变化,从 v1 迁移需要注意。
| 特性 | v1 | v2 |
|---|---|---|
| 性能 | 基准 | 快 5-50 倍 |
| 校验器 | @validator | @field_validator |
| 模型配置 | class Config | model_config = ConfigDict(...) |
| JSON 序列化 | .json() | .model_dump_json() |
| 字典转换 | .dict() | .model_dump() |
| 从字典/对象解析 | .parse_obj() / .from_orm() | .model_validate() |
二、模型定义:从简单到复杂
2.1 基础字段与类型
from pydantic import BaseModel, Field, field_validator, model_validator
from typing import Annotated, Optional
from datetime import datetime
from enum import Enum
class ModelType(str, Enum):
    """Identifiers of the LLM models a configuration may select.

    Mixing in ``str`` makes every member compare equal to its raw string
    value, so a member can be used wherever the plain model name is expected.
    """

    GPT4 = "gpt-4"
    GPT35 = "gpt-3.5-turbo"
    CLAUDE = "claude-3-opus"
class LLMConfig(BaseModel):
model: ModelType = ModelType.GPT35
temperature: float = Field(default=0.7, ge=0.0, le=2.0, description="采样温度")
max_tokens: int = Field(default=1000, ge=1, le=8192, description="最大 token 数")
top_p: float = Field(default=1.0, ge=0.0, le=1.0)
stop: list[str] = Field(default_factory=list)
system_prompt: Optional[str] = Field(default=None, max_length=4096)
Field 里的 ge(大于等于)、le(小于等于)、max_length 等约束直接在运行时验证,不需要手写 if 判断。
2.2 自定义字段校验器
from pydantic import field_validator, ValidationInfo
class UserRegistration(BaseModel):
username: str = Field(min_length=3, max_length=50)
email: str
password: str = Field(min_length=8)
confirm_password: str
age: Optional[int] = Field(default=None, ge=0, le=150)
@field_validator("email")
@classmethod
def validate_email(cls, v: str) -> str:
    """Normalize an e-mail address and reject obviously malformed ones.

    The value is stripped and lower-cased first, so the checks run on exactly
    the string that is returned.

    Raises:
        ValueError: if there is no "@", the part before the last "@" is
            empty, or the domain is not made of at least two non-empty
            dot-separated labels (e.g. "@b.c", "a@b.", "a@b..c").
    """
    email = v.strip().lower()
    local_part, at_sign, domain = email.rpartition("@")
    labels = domain.split(".")
    # `all(labels)` rejects empty labels produced by leading, trailing or
    # doubled dots in the domain.
    if not at_sign or not local_part or len(labels) < 2 or not all(labels):
        raise ValueError("邮箱格式不正确")
    return email
@field_validator("username")
@classmethod
def validate_username(cls, v: str) -> str:
    """Accept only ASCII letters, digits and underscores in the username.

    Raises:
        ValueError: if any other character is present.
    """
    import re

    # fullmatch instead of match + `$`: `$` also matches just before a
    # trailing newline, which would let a value like "abc\n" through.
    if re.fullmatch(r"[a-zA-Z0-9_]+", v) is None:
        raise ValueError("用户名只能包含字母、数字和下划线")
    return v
@model_validator(mode="after")
def check_passwords_match(self) -> "UserRegistration":
"""跨字段验证:密码和确认密码必须一致"""
if self.password != self.confirm_password:
raise ValueError("两次输入的密码不一致")
return self
2.3 Annotated 类型:复用验证逻辑
from typing import Annotated
from pydantic import StringConstraints
# Reusable constrained type aliases: declare the constraint once, use it in any model.
# float that must be strictly greater than 0
PositiveFloat = Annotated[float, Field(gt=0)]
# str that has surrounding whitespace stripped and must contain at least one character
NonEmptyStr = Annotated[str, StringConstraints(min_length=1, strip_whitespace=True)]
# str that must match the e-mail regex below.
# NOTE(review): this local alias has the same name as pydantic's own `EmailStr`
# type; it is a plain pattern-constrained str, not that type.
EmailStr = Annotated[str, Field(pattern=r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$")]
class Product(BaseModel):
name: NonEmptyStr
price: PositiveFloat
discount_price: Optional[PositiveFloat] = None
contact_email: EmailStr
三、序列化:model_dump 与 model_dump_json
from pydantic import BaseModel, Field
from datetime import datetime
class Article(BaseModel):
    """Article model used to demonstrate the serialization options."""

    id: int
    title: str
    content: str
    # Set by calling datetime.now() at construction when no value is supplied.
    created_at: datetime = Field(default_factory=datetime.now)
    is_published: bool = False
    # default_factory builds a fresh list for each instance.
    tags: list[str] = Field(default_factory=list)
article = Article(id=1, title="Python 进阶", content="内容...")
# Basic serialization to a plain dict
d = article.model_dump()
# {'id': 1, 'title': 'Python 进阶', ...}
# Leave the listed fields out
d_no_content = article.model_dump(exclude={"content"})
# Keep only the listed fields
d_summary = article.model_dump(include={"id", "title", "is_published"})
# Drop fields whose value is None (common for API responses)
d_no_none = article.model_dump(exclude_none=True)
# Drop fields that still hold their default value
d_no_default = article.model_dump(exclude_defaults=True)
# JSON serialization (datetime is converted to an ISO-format string automatically)
json_str = article.model_dump_json()
json_str_indent = article.model_dump_json(indent=2)
3.1 自定义序列化
from pydantic import field_serializer
class SensitiveUser(BaseModel):
    """User record whose sensitive fields are masked whenever it is serialized."""

    id: int
    name: str
    email: str
    password_hash: str  # never emitted as-is; replaced by a mask on serialization

    @field_serializer("password_hash")
    def serialize_password(self, value: str, _info) -> str:
        """Replace the stored hash with a fixed mask."""
        return "***"

    @field_serializer("email")
    def serialize_email(self, value: str, _info) -> str:
        """Mask the local part of the address, keeping its first two characters.

        ``email`` is declared as a plain ``str`` with no format validation, so
        a value without "@" can reach this method. Such a value is masked
        completely instead of raising IndexError during serialization.
        """
        local_part, at_sign, domain = value.rpartition("@")
        if not at_sign:
            return "***"
        return f"{local_part[:2]}***@{domain}"
user = SensitiveUser(id=1, name="老张", email="laoz@example.com", password_hash="abc123")
print(user.model_dump())
# {'id': 1, 'name': '老张', 'email': 'la***@example.com', 'password_hash': '***'}
四、模型继承与组合
4.1 继承共享基础字段
from pydantic import BaseModel, ConfigDict
from datetime import datetime
class TimestampMixin(BaseModel):
    """Reusable base class that contributes the timestamp fields."""

    # Set by calling datetime.now() at construction when no value is supplied.
    created_at: datetime = Field(default_factory=datetime.now)
    updated_at: Optional[datetime] = None
class IdentifiedMixin(BaseModel):
    """Reusable base class that contributes the required ``id`` field."""

    id: int
# Request model (no id, no timestamps)
class ArticleCreate(BaseModel):
    """Payload accepted when creating an article."""

    title: str = Field(min_length=1, max_length=200)
    content: str = Field(min_length=1)
    # default_factory builds a fresh list for each instance.
    tags: list[str] = Field(default_factory=list)
# Response model (adds id and timestamps to the creation fields)
class ArticleResponse(ArticleCreate, IdentifiedMixin, TimestampMixin):
    """Article as returned to clients: creation fields plus id and timestamps."""

    # from_attributes lets the model be built from an ORM object's attributes.
    model_config = ConfigDict(from_attributes=True)
# 更新模型(所有字段可选)
class ArticleUpdate(BaseModel):
title: Optional[str] = Field(default=None, min_length=1, max_length=200)
content: Optional[str] = Field(default=None, min_length=1)
tags: Optional[list[str]] = None
4.2 model_config 配置
from pydantic import ConfigDict
class APIResponse(BaseModel):
model_config = ConfigDict(
from_attributes=True, # 允许从 ORM 对象、命名元组构建
populate_by_name=True, # 允许用字段名(而不只是别名)赋值
str_strip_whitespace=True, # 字符串自动去掉首尾空格
validate_assignment=True, # 赋值时也进行校验
frozen=False, # 允许修改(True = 不可变,可哈希)
extra="ignore", # 忽略额外字段("forbid" = 报错,"allow" = 保留)
)
五、完整可运行示例
#!/usr/bin/env python3
"""
Pydantic v2 完整实战示例:LLM 调用参数验证
"""
from __future__ import annotations
import json
from datetime import datetime
from enum import Enum
from typing import Annotated, Any, Optional
from pydantic import (
BaseModel,
ConfigDict,
Field,
field_serializer,
field_validator,
model_validator,
)
# ===== 枚举 =====
class LLMProvider(str, Enum):
    """Backends a request can be routed to.

    The ``str`` mix-in makes each member compare equal to its raw string value.
    """

    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    LOCAL = "local"
class MessageRole(str, Enum):
    """Speaker of a chat message.

    The ``str`` mix-in makes each member compare equal to its raw string value.
    """

    SYSTEM = "system"
    USER = "user"
    ASSISTANT = "assistant"
# ===== 自定义类型 =====
PositiveInt = Annotated[int, Field(gt=0)]
ClampedFloat = Annotated[float, Field(ge=0.0, le=2.0)]
# ===== 嵌套模型 =====
class Message(BaseModel):
    """A single chat message: who said it and what was said."""

    role: MessageRole
    content: str = Field(min_length=1, max_length=100_000)

    @field_validator("content")
    @classmethod
    def strip_content(cls, v: str) -> str:
        """Trim surrounding whitespace and reject content that is blank afterwards.

        This validator runs after the ``min_length`` check on the raw value,
        so a whitespace-only string would pass that check and then be
        stripped down to an empty string. The emptiness check is therefore
        repeated here on the stripped value.

        Raises:
            ValueError: if nothing is left after stripping.
        """
        stripped = v.strip()
        if not stripped:
            raise ValueError("消息内容不能为空")
        return stripped
# ===== 主请求模型 =====
class LLMRequest(BaseModel):
    """Validated parameters for one LLM call."""

    model_config = ConfigDict(
        str_strip_whitespace=True,
        validate_assignment=True,
        extra="forbid",
    )

    provider: LLMProvider = LLMProvider.OPENAI
    model_name: str = Field(default="gpt-4o", min_length=1)
    messages: list[Message] = Field(min_length=1)
    temperature: ClampedFloat = 0.7
    max_tokens: PositiveInt = 1000
    stream: bool = False
    metadata: dict[str, Any] = Field(default_factory=dict)
    request_id: Optional[str] = None

    @field_validator("messages")
    @classmethod
    def validate_messages(cls, v: list[Message]) -> list[Message]:
        """Require at least one message whose role is ``MessageRole.USER``.

        Raises:
            ValueError: if no message in the list has the user role.
        """
        for msg in v:
            if msg.role == MessageRole.USER:
                return v
        raise ValueError("消息列表中必须至少包含一条用户消息")

    @model_validator(mode="after")
    def validate_local_model(self) -> LLMRequest:
        """Cross-field check: the local provider cannot be combined with streaming.

        Raises:
            ValueError: if ``provider`` is LOCAL while ``stream`` is true.
        """
        if self.provider == LLMProvider.LOCAL:
            if self.stream:
                raise ValueError("本地模型暂不支持流式输出")
        return self

    @field_serializer("provider", "messages")
    def serialize_enum_fields(self, v, _info):
        """Serialize ``provider`` to its raw value and ``messages`` to a list of dicts."""
        if isinstance(v, Enum):
            result = v.value
        elif isinstance(v, list):
            result = [item.model_dump() for item in v]
        else:
            result = v
        return result
# ===== 响应模型 =====
class LLMResponse(BaseModel):
    """Result of one LLM call, serialized in a presentation-friendly form."""

    request_id: str
    model_name: str
    content: str
    tokens_used: int
    # Set by calling datetime.now() at construction when no value is supplied.
    created_at: datetime = Field(default_factory=datetime.now)
    cost_usd: Optional[float] = None

    @field_serializer("created_at")
    def serialize_datetime(self, v: datetime, _info) -> str:
        """Emit the timestamp as the string returned by ``datetime.isoformat()``."""
        return v.isoformat()

    @field_serializer("cost_usd")
    def serialize_cost(self, v: Optional[float], _info) -> Optional[str]:
        """Format the cost as a dollar amount with six decimals; pass ``None`` through.

        The return annotation uses ``Optional[str]`` like the rest of the
        file. NOTE(review): pydantic evaluates this annotation at runtime, and
        the ``str | None`` spelling presumably cannot be evaluated on
        interpreters older than 3.10 — confirm the supported versions.
        """
        if v is None:
            return None
        return f"${v:.6f}"
def simulate_llm_call(request: LLMRequest) -> LLMResponse:
    """Simulate an LLM call by echoing the most recent user message.

    The token count is twice the number of whitespace-separated words in that
    message, and the cost is a fixed value. ``next`` is called without a
    default, so a request holding no user message raises StopIteration.
    """
    newest_first = reversed(request.messages)
    last_user_msg = next(msg for msg in newest_first if msg.role == MessageRole.USER)
    question = last_user_msg.content
    return LLMResponse(
        request_id=request.request_id or "req-001",
        model_name=request.model_name,
        content=f"这是对 '{question[:20]}...' 的回答",
        tokens_used=2 * len(question.split()),
        cost_usd=0.00015,
    )
def main():
    """Run the demo: one valid request, then several requests that must be rejected."""
    # Imported locally so this function carries its own dependency; pydantic
    # is already required by the rest of the example.
    from pydantic import ValidationError

    print("=== Pydantic v2 LLM 请求验证演示 ===\n")

    # A well-formed request: validate it, print it, then run the simulated call.
    try:
        req = LLMRequest(
            provider="openai",
            model_name="gpt-4o",
            messages=[
                {"role": "system", "content": "你是一个专业的代码审查助手"},
                {"role": "user", "content": "帮我审查这段 Python 代码"},
            ],
            temperature=0.5,
            max_tokens=2000,
        )
        print("请求验证通过:")
        print(req.model_dump_json(indent=2))
        resp = simulate_llm_call(req)
        print("\nLLM 响应:")
        print(resp.model_dump_json(indent=2))
    except Exception as e:
        print(f"意外错误: {e}")

    # Each entry pairs invalid constructor arguments with a label for the output.
    test_cases = [
        ({"messages": [{"role": "user", "content": "hello"}], "temperature": 3.0}, "温度超范围"),
        ({"messages": [{"role": "system", "content": "system only"}]}, "缺少用户消息"),
        ({"messages": [{"role": "user", "content": "test"}], "max_tokens": -1}, "负数 max_tokens"),
        ({"messages": [], "temperature": 0.5}, "空消息列表"),
    ]
    print("\n=== 验证错误测试 ===")
    for params, desc in test_cases:
        try:
            LLMRequest(**params)
        except ValidationError as exc:
            # Only a validation failure counts as "correctly rejected"; any
            # other exception is a real bug and is allowed to propagate.
            errors = exc.errors()
            first_error = errors[0]["msg"] if errors else str(exc)
            print(f" {desc}: 正确拦截 - {first_error[:60]}")
        else:
            print(f" {desc}: 意外通过(BUG!)")
if __name__ == "__main__":
main()
六、踩坑实录 1:v1 的 @validator 在 v2 中被废弃
# v1 写法(v2 中仍然能用,但会有 DeprecationWarning)
from pydantic import validator
class MyModel(BaseModel):
    """v1-style model, shown only to illustrate the deprecated ``@validator``."""

    name: str

    @validator("name")
    def validate_name(cls, v):
        """Return ``name`` with surrounding whitespace removed."""
        trimmed = v.strip()
        return trimmed
# v2 正确写法
from pydantic import field_validator
class MyModel(BaseModel):
name: str
@field_validator("name")
@classmethod
def validate_name(cls, v: str) -> str:
return v.strip()
七、踩坑实录 2:model_validate 和构造函数的区别
# 两种构建方式的区别
class User(BaseModel):
    """Minimal model used to compare the two ways of building an instance."""

    id: int
    name: str
# 方式1:直接构造(传入原始数据,会做验证)
user1 = User(id="1", name="老张") # id 会从字符串转成 int
user2 = User.model_validate({"id": "1", "name": "老张"}) # 等价
# Approach 2: build from an ORM object (requires from_attributes=True)
class UserModel(BaseModel):
    """Model that can be populated from an object's attributes."""

    model_config = ConfigDict(from_attributes=True)

    id: int
    name: str
# orm_obj.id = 1, orm_obj.name = "老张"
user = UserModel.model_validate(orm_obj) # 从 ORM 对象属性读取
八、踩坑实录 3:嵌套模型的验证错误信息
from pydantic import ValidationError
try:
LLMRequest(
messages=[
{"role": "user", "content": ""}, # 空内容!
{"role": "invalid_role", "content": "hello"}, # 无效角色!
]
)
except ValidationError as e:
print(e)
# 输出详细的嵌套错误信息,包括哪个字段、哪一条、什么错误
for error in e.errors():
print(f"字段路径: {error['loc']}")
print(f"错误信息: {error['msg']}")
print(f"错误类型: {error['type']}")
Pydantic v2 的错误信息非常详细,字段路径包含嵌套层级(如 ('messages', 0, 'content')),非常利于调试。
总结
Pydantic v2 的核心使用要点:
- Field 里直接声明约束(ge/le/min_length 等),不需要手写 if
- @field_validator 处理单字段自定义逻辑,@model_validator 处理跨字段校验
- model_dump() 替代 .dict(),支持 exclude/include/exclude_none
- model_config = ConfigDict(...) 替代内部 class Config
- from_attributes=True 让模型可以从 ORM 对象构建
