第2443篇:AI能力的服务化与开放——对内共享和对外输出AI能力的工程方案
2026/4/30 · 大约 8 分钟
第2443篇:AI能力的服务化与开放——对内共享和对外输出AI能力的工程方案
适读人群:AI平台架构师、技术总监 | 阅读时长:约13分钟 | 核心价值:掌握AI能力服务化的技术架构和治理机制,支持对内共享和对外开放
公司内部团队之间共享AI能力,和对外向合作伙伴开放AI能力——这两件事看起来不同,但背后的工程问题是一致的:如何把AI能力封装成可靠的、可管控的服务?
我们在做某个内部AI服务化项目时,起初只考虑了对内共享。服务上线后三个月,商务团队来找我,说有个大客户想直接集成我们的AI能力,问能不能开放API给他们。
我们看了看代码,发现完全没有考虑对外开放的场景:没有多租户隔离、没有细粒度的访问控制、没有使用量计量和限速、没有对外的文档。想支持对外开放,几乎要重新设计。
如果当时做服务化的时候,多想一步"这个服务未来是否可能对外开放",就能在架构上多预留一些空间,不至于后来这么被动。
一、AI能力服务化的三个层次
本文重点讨论L1(对内共享)和L2(对外开放),这是大多数企业AI团队会面临的场景。
二、服务化的核心技术架构
2.1 AI服务网关
对内和对外都需要一个统一的API网关作为入口:
# AI服务网关的核心职责
AI_GATEWAY_RESPONSIBILITIES = {
"authentication": "验证调用方身份(API Key / OAuth2 / JWT)",
"authorization": "验证调用方是否有权访问这个能力",
"rate_limiting": "限制调用频率,防止滥用",
"quota_management": "管理每个调用方的用量配额",
"request_transformation": "统一的请求/响应格式转换",
"observability": "请求日志、监控、告警",
"routing": "将请求路由到正确的后端AI服务"
}
# 网关的技术实现方案
GATEWAY_OPTIONS = {
"kong": {
"type": "开源网关",
"ai_specific_features": "Kong AI Gateway插件支持LLM特有功能",
"suitable_for": "已有Kong基础设施的团队"
},
"azure_api_management": {
"type": "托管网关",
"ai_specific_features": "内置Azure OpenAI访问控制,与Azure生态集成好",
"suitable_for": "Azure用户"
},
"custom_gateway": {
"type": "自研",
"suitable_for": "有特殊需求的团队",
"warning": "自研成本高,维护负担重,谨慎选择"
}
}2.2 多租户隔离设计
多租户是AI服务化的核心工程挑战:
from enum import Enum
from typing import Optional
import hashlib
class IsolationLevel(Enum):
"""隔离级别"""
SHARED = "shared" # 完全共享,仅逻辑隔离
NAMESPACE = "namespace" # 命名空间隔离
DEDICATED = "dedicated" # 独立实例
class TenantConfig:
"""租户配置"""
def __init__(self, tenant_id: str, tier: str = "standard"):
self.tenant_id = tenant_id
self.tier = tier
def get_isolation_level(self) -> IsolationLevel:
tier_isolation = {
"free": IsolationLevel.SHARED,
"standard": IsolationLevel.NAMESPACE,
"enterprise": IsolationLevel.DEDICATED
}
return tier_isolation.get(self.tier, IsolationLevel.SHARED)
def get_rate_limits(self) -> dict:
tier_limits = {
"free": {"rpm": 10, "tpm": 10000, "daily_tokens": 100000},
"standard": {"rpm": 100, "tpm": 100000, "daily_tokens": 1000000},
"enterprise": {"rpm": 1000, "tpm": 1000000, "daily_tokens": None} # 无限制
}
return tier_limits.get(self.tier, tier_limits["free"])
class AIServiceRouter:
"""AI服务路由器:根据租户配置路由请求"""
def __init__(self):
self.tenant_registry = {} # tenant_id -> TenantConfig
def get_tenant_endpoint(self, tenant_id: str) -> str:
"""获取租户对应的后端服务端点"""
config = self.tenant_registry.get(tenant_id)
if not config:
raise ValueError(f"Unknown tenant: {tenant_id}")
isolation = config.get_isolation_level()
if isolation == IsolationLevel.DEDICATED:
# 独立实例,每个租户有自己的后端
return f"http://ai-service-{tenant_id}.internal"
elif isolation == IsolationLevel.NAMESPACE:
# 命名空间隔离,共享后端但逻辑隔离
return "http://ai-service-shared.internal"
else:
# 完全共享
return "http://ai-service-shared.internal"
def inject_tenant_context(self, request: dict, tenant_id: str) -> dict:
"""在请求中注入租户上下文"""
config = self.tenant_registry.get(tenant_id)
isolation = config.get_isolation_level() if config else IsolationLevel.SHARED
# 在namespace和shared模式下,需要在请求中携带租户标识
if isolation in [IsolationLevel.NAMESPACE, IsolationLevel.SHARED]:
request["headers"] = request.get("headers", {})
request["headers"]["X-Tenant-ID"] = tenant_id
return request2.3 API Key管理系统
import secrets
import time
from typing import Optional
class APIKeyManager:
"""API Key管理"""
def generate_api_key(self,
tenant_id: str,
description: str,
expires_days: Optional[int] = 365,
scopes: list = None) -> dict:
"""
生成API Key
scopes: 允许访问的能力范围,如 ["text-generation", "embedding"]
"""
key_value = f"sk-{secrets.token_urlsafe(32)}"
api_key = {
"key_id": f"key_{secrets.token_hex(8)}",
"key_value": key_value, # 只在创建时返回,之后不再明文存储
"key_hash": self._hash_key(key_value), # 存储hash
"tenant_id": tenant_id,
"description": description,
"scopes": scopes or ["*"],
"created_at": int(time.time()),
"expires_at": int(time.time()) + (expires_days * 86400) if expires_days else None,
"status": "active",
"last_used_at": None,
"usage_count": 0
}
# 存储(只存hash,不存明文)
self._store_key(api_key)
return {
"key_id": api_key["key_id"],
"key_value": key_value, # 只在创建时返回
"warning": "请立即保存,此key不会再次显示"
}
def validate_key(self, key_value: str) -> Optional[dict]:
"""验证API Key"""
key_hash = self._hash_key(key_value)
key_record = self._find_by_hash(key_hash)
if not key_record:
return None
if key_record["status"] != "active":
return None
# 检查过期
if key_record.get("expires_at") and time.time() > key_record["expires_at"]:
self._update_status(key_record["key_id"], "expired")
return None
# 更新使用记录
self._update_usage(key_record["key_id"])
return key_record
def _hash_key(self, key_value: str) -> str:
return hashlib.sha256(key_value.encode()).hexdigest()
def _store_key(self, key_record: dict):
# 实际实现:存储到数据库
pass
def _find_by_hash(self, key_hash: str) -> Optional[dict]:
# 实际实现:从数据库查询
pass
def _update_status(self, key_id: str, status: str):
pass
def _update_usage(self, key_id: str):
pass三、使用量计量和成本分摊
无论对内还是对外,都需要准确计量使用量:
class UsageTracker:
"""使用量追踪器"""
def record_api_call(self,
tenant_id: str,
api_key_id: str,
capability: str,
request_metadata: dict,
response_metadata: dict) -> dict:
"""
记录一次API调用的使用量
capability: 使用的AI能力(如 "text-generation", "embedding")
request_metadata: 请求相关信息(prompt长度等)
response_metadata: 响应相关信息(completion长度、延迟等)
"""
usage_record = {
"tenant_id": tenant_id,
"api_key_id": api_key_id,
"capability": capability,
"timestamp": int(time.time()),
"input_tokens": request_metadata.get("input_tokens", 0),
"output_tokens": response_metadata.get("output_tokens", 0),
"total_tokens": (request_metadata.get("input_tokens", 0) +
response_metadata.get("output_tokens", 0)),
"latency_ms": response_metadata.get("latency_ms"),
"status": response_metadata.get("status", "success"),
"model_used": response_metadata.get("model"),
"cost_usd": self._calculate_cost(
capability,
request_metadata.get("input_tokens", 0),
response_metadata.get("output_tokens", 0),
response_metadata.get("model")
)
}
# 异步写入到时序数据库
self._write_to_storage_async(usage_record)
return usage_record
def _calculate_cost(self, capability: str, input_tokens: int,
output_tokens: int, model: str) -> float:
"""计算实际成本(按现有LLM定价)"""
# 示意性价格,实际应从配置读取
pricing = {
"gpt-4o": {"input": 0.0025 / 1000, "output": 0.010 / 1000}, # USD per token
"claude-3-5-sonnet": {"input": 0.003 / 1000, "output": 0.015 / 1000},
"text-embedding-ada-002": {"input": 0.0001 / 1000, "output": 0}
}
model_pricing = pricing.get(model, {"input": 0, "output": 0})
return (input_tokens * model_pricing["input"] +
output_tokens * model_pricing["output"])
def get_tenant_usage_report(self, tenant_id: str,
start_time: int, end_time: int) -> dict:
"""获取租户使用报告"""
records = self._query_usage(tenant_id, start_time, end_time)
if not records:
return {"tenant_id": tenant_id, "total_calls": 0, "total_tokens": 0, "total_cost_usd": 0}
return {
"tenant_id": tenant_id,
"period": {"start": start_time, "end": end_time},
"total_calls": len(records),
"total_tokens": sum(r["total_tokens"] for r in records),
"total_cost_usd": sum(r["cost_usd"] for r in records),
"by_capability": self._group_by_capability(records),
"by_day": self._group_by_day(records)
}
def _write_to_storage_async(self, record: dict):
pass
def _query_usage(self, tenant_id: str, start_time: int, end_time: int) -> list:
return []
def _group_by_capability(self, records: list) -> dict:
result = {}
for r in records:
cap = r["capability"]
if cap not in result:
result[cap] = {"calls": 0, "tokens": 0, "cost_usd": 0}
result[cap]["calls"] += 1
result[cap]["tokens"] += r["total_tokens"]
result[cap]["cost_usd"] += r["cost_usd"]
return result
def _group_by_day(self, records: list) -> dict:
from datetime import datetime
result = {}
for r in records:
day = datetime.fromtimestamp(r["timestamp"]).strftime("%Y-%m-%d")
if day not in result:
result[day] = {"calls": 0, "tokens": 0}
result[day]["calls"] += 1
result[day]["tokens"] += r["total_tokens"]
return result四、对外开放的特殊要求
相比对内共享,对外开放还需要额外关注:
EXTERNAL_API_ADDITIONAL_REQUIREMENTS = {
"documentation": {
"required": [
"OpenAPI/Swagger规范文档",
"入门教程(Getting Started)",
"API参考文档(每个接口的参数、返回值、错误码)",
"使用示例代码(Python/Java/Node.js等)",
"错误处理指南",
"速率限制说明"
],
"important": "文档质量直接决定外部开发者的采用率"
},
"versioning": {
"policy": """
API版本必须遵循语义化版本控制:
- 破坏性变更:升大版本(v1 -> v2),旧版本保留至少12个月
- 新增功能:升小版本(v1.0 -> v1.1),向后兼容
- Bug修复:升补丁版本
""",
"implementation": "URL路径包含大版本号:/api/v1/..."
},
"abuse_prevention": {
"measures": [
"内容过滤:检测并拒绝有害请求",
"速率限制:防止单用户占用过多资源",
"成本上限:每个API Key设置月度消费上限",
"用量异常检测:发现不正常的调用模式",
"CAPTCHA/人机验证(对高敏感操作)"
]
},
"support_model": {
"required": [
"状态页(https://status.yourdomain.com)",
"开发者社区(Slack/Discord/论坛)",
"Issue追踪(GitHub Issues或工单系统)",
"变更日志(每次API变更的记录)"
]
}
}五、能力目录的设计
服务化之后,需要一个清晰的能力目录让使用者了解有哪些AI能力可用:
AI_CAPABILITY_CATALOG = {
"text_generation": {
"name": "文本生成",
"description": "基于提示词生成文本内容,支持多种风格和格式",
"use_cases": ["内容创作", "问答", "摘要", "翻译"],
"endpoint": "/v1/generate",
"pricing_unit": "按token计费",
"latency_typical": "500ms - 5s",
"availability_sla": "99.9%",
"models_available": ["gpt-4o", "claude-3-5-sonnet", "qwen-max"],
"rate_limits": {
"free": "10 RPM",
"standard": "100 RPM",
"enterprise": "自定义"
}
},
"embedding": {
"name": "向量嵌入",
"description": "将文本转换为向量表示,用于语义搜索和相似度计算",
"use_cases": ["语义搜索", "文档聚类", "相似内容推荐"],
"endpoint": "/v1/embed",
"pricing_unit": "按token计费",
"latency_typical": "50ms - 200ms",
"availability_sla": "99.9%"
},
"document_qa": {
"name": "文档问答",
"description": "基于上传的文档集合回答问题(RAG)",
"use_cases": ["企业知识库", "合同分析", "技术文档查询"],
"endpoint": "/v1/document-qa",
"pricing_unit": "按查询次数 + token计费",
"note": "需要先上传文档建立知识库"
}
}六、从对内到对外的渐进路径
不需要一步到位,可以分阶段演进:
阶段一:对内共享(0-6个月)
- 建立基础API网关
- 实现基本的认证和访问控制
- 部署使用量监控
- 内部文档和支持
阶段二:内部平台成熟(6-18个月)
- 完善多租户隔离
- 建立SLA机制
- 精确的计量和成本分摊
- 开发者体验优化
阶段三:合作伙伴开放(18个月+)
- 完整的对外文档
- 正式SLA和合同
- 合作伙伴支持体系
- 滥用防护

AI能力服务化是AI工程成熟度的重要标志。从内部混乱的直接调用,到有治理的服务化交付,再到对外开放赋能生态——这条路每一步都有实实在在的工程价值。关键是不要等到"完美"再开始,先建立基础框架,在实际使用中迭代。
