第2296篇:AI基础设施即代码——用Terraform和Kubernetes管理AI服务
2026/4/30大约 4 分钟
第2296篇:AI基础设施即代码——用Terraform和Kubernetes管理AI服务
适读人群:需要管理AI服务基础设施的工程师和DevOps工程师 | 阅读时长:约14分钟 | 核心价值:掌握用IaC方式管理AI服务的核心资源,实现可重复、可审计的AI基础设施
我们团队有段时间的AI基础设施是手动配置的:Redis配置是运维同学登上去手动设的,Kubernetes的HPA阈值是某个工程师某天改的,向量数据库的分片配置……谁也说不清楚当时为什么这样设。
有一次我们要在新环境里复现生产环境,花了整整两天才搞定,而且还不能保证完全一样。
基础设施即代码(IaC)在AI服务上的价值:让AI服务的基础设施像代码一样可版本控制、可审查、可重复部署。
Terraform管理AI服务的云资源
以AWS为例,AI服务通常需要的云资源:
# terraform/ai-infrastructure/main.tf
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
    # Needed for random_id.suffix below (S3 bucket name uniqueness)
    random = {
      source  = "hashicorp/random"
      version = "~> 3.6"
    }
  }
  backend "s3" {
    bucket = "your-terraform-state-bucket"
    key    = "ai-infrastructure/terraform.tfstate"
    region = "us-east-1"
  }
}

# ---- Input variables ----
variable "environment" {
  description = "部署环境:dev/staging/prod"
  type        = string
}

variable "ai_instance_type" {
  description = "AI服务的EC2实例类型"
  type        = string
  default     = "c5.2xlarge" # 8 cores / 16 GB — suitable for CPU inference
}

# FIX: referenced below as var.db_password but was never declared.
# Inject via TF_VAR_db_password from SSM Parameter Store or Vault; never commit it.
variable "db_password" {
  description = "RDS master password (injected from SSM Parameter Store or Vault)"
  type        = string
  sensitive   = true
}

# FIX: referenced below as random_id.suffix.hex but was never declared.
resource "random_id" "suffix" {
  byte_length = 4
}

# EKS cluster that runs the AI services.
# NOTE(review): module.vpc is expected to be declared elsewhere (e.g. vpc.tf) — confirm.
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 20.0"

  cluster_name    = "ai-platform-${var.environment}"
  cluster_version = "1.29"
  vpc_id          = module.vpc.vpc_id
  subnet_ids      = module.vpc.private_subnets

  # Dedicated AI worker node group — tainted so only AI workloads schedule here
  eks_managed_node_groups = {
    ai_workers = {
      min_size       = 2
      max_size       = 20
      desired_size   = 3
      instance_types = [var.ai_instance_type]
      labels = {
        role = "ai-worker"
      }
      taints = [{
        key    = "ai-workload"
        value  = "true"
        effect = "NO_SCHEDULE"
      }]
    }
  }
}

# ElastiCache Redis — conversation context cache
resource "aws_elasticache_replication_group" "ai_context_cache" {
  replication_group_id = "ai-context-${var.environment}"
  description          = "AI对话上下文缓存"
  engine               = "redis"
  engine_version       = "7.2"
  node_type            = "cache.r7g.large" # memory-optimized instance
  num_cache_clusters   = var.environment == "prod" ? 3 : 1

  # High availability in production only
  automatic_failover_enabled = var.environment == "prod"
  multi_az_enabled           = var.environment == "prod"

  # Encryption at rest and in transit
  at_rest_encryption_enabled = true
  transit_encryption_enabled = true

  parameter_group_name = aws_elasticache_parameter_group.ai_redis_params.name

  tags = {
    Environment = var.environment
    Service     = "ai-platform"
  }
}

# Redis parameter tuning
resource "aws_elasticache_parameter_group" "ai_redis_params" {
  name   = "ai-redis-params-${var.environment}"
  family = "redis7"

  parameter {
    name  = "maxmemory-policy"
    value = "allkeys-lru" # LRU eviction — appropriate for a cache workload
  }
  parameter {
    name  = "timeout"
    value = "300" # close idle connections after 5 minutes
  }
}

# RDS PostgreSQL — vector storage via the pgvector extension
resource "aws_db_instance" "ai_vector_db" {
  identifier            = "ai-vector-db-${var.environment}"
  engine                = "postgres"
  engine_version        = "16.1"
  instance_class        = var.environment == "prod" ? "db.r7g.2xlarge" : "db.t3.medium"
  allocated_storage     = 500  # GB
  max_allocated_storage = 2000 # storage autoscaling ceiling
  storage_encrypted     = true

  db_name  = "aidb"
  username = "aidbadmin"
  password = var.db_password # sourced from SSM Parameter Store or Vault

  # High availability in production only
  multi_az                     = var.environment == "prod"
  backup_retention_period      = 7
  performance_insights_enabled = true

  tags = {
    Environment = var.environment
    Service     = "ai-platform"
  }
}

# Secrets Manager — AI provider API keys
resource "aws_secretsmanager_secret" "ai_api_keys" {
  name                    = "ai-platform/${var.environment}/api-keys"
  description             = "AI服务API密钥"
  recovery_window_in_days = var.environment == "prod" ? 30 : 0
}

# S3 bucket — training data and model artifacts
resource "aws_s3_bucket" "ai_artifacts" {
  bucket = "ai-artifacts-${var.environment}-${random_id.suffix.hex}"

  tags = {
    Environment = var.environment
    Purpose     = "ai-artifacts"
  }
}

resource "aws_s3_bucket_versioning" "ai_artifacts_versioning" {
  bucket = aws_s3_bucket.ai_artifacts.id
  versioning_configuration {
    status = "Enabled"
  }
}

Kubernetes配置AI服务部署
# k8s/ai-service/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ai-conversation-service
  namespace: ai-platform
  labels:
    app: ai-conversation-service
    version: "{{ .Values.imageTag }}"
spec:
  replicas: {{ .Values.replicas.initial }}
  selector:
    matchLabels:
      app: ai-conversation-service
  template:
    metadata:
      labels:
        app: ai-conversation-service
      annotations:
        # Prometheus metrics scraping
        prometheus.io/scrape: "true"
        prometheus.io/port: "9090"
        prometheus.io/path: "/actuator/prometheus"
    spec:
      # Schedule only onto the dedicated AI worker nodes
      nodeSelector:
        role: ai-worker
      tolerations:
        - key: "ai-workload"
          operator: "Equal"
          value: "true"
          effect: "NoSchedule"
      # Graceful shutdown: wait for in-flight AI requests to complete
      terminationGracePeriodSeconds: 120
      containers:
        - name: ai-conversation-service
          image: "{{ .Values.image.repository }}:{{ .Values.imageTag }}"
          ports:
            - containerPort: 8080
              name: http
            - containerPort: 9090
              name: metrics
          # Resource requests/limits
          resources:
            requests:
              cpu: "1"
              memory: "2Gi"
            limits:
              cpu: "4"
              memory: "6Gi"
          env:
            # FIX: templated scalar quoted — an empty expansion would otherwise render null
            - name: SPRING_PROFILES_ACTIVE
              value: "{{ .Values.environment }}"
            - name: REDIS_HOST
              valueFrom:
                secretKeyRef:
                  name: ai-service-secrets
                  key: redis-host
            - name: ANTHROPIC_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ai-api-keys
                  key: anthropic-key
          # Readiness probe (gates traffic routing)
          readinessProbe:
            httpGet:
              path: /actuator/health/readiness
              port: 8080
            initialDelaySeconds: 30 # AI services start slowly
            periodSeconds: 10
            failureThreshold: 3
          # Liveness probe (triggers pod restart)
          livenessProbe:
            httpGet:
              path: /actuator/health/liveness
              port: 8080
            initialDelaySeconds: 60
            periodSeconds: 30
            failureThreshold: 3
# k8s/ai-service/hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ai-conversation-service-hpa
  namespace: ai-platform
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ai-conversation-service
  minReplicas: {{ .Values.replicas.min }}
  maxReplicas: {{ .Values.replicas.max }}
  metrics:
    # Scale on CPU utilization
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    # Custom metric: AI request queue depth (Kafka consumer lag)
    - type: External
      external:
        metric:
          name: kafka_consumer_group_lag
          selector:
            matchLabels:
              consumer_group: ai-conversation-service
        target:
          type: AverageValue
          averageValue: "50"
Helm Chart管理多环境配置差异
# helm/ai-platform/values-prod.yaml
environment: prod
imageTag: "1.5.2"
replicas:
  initial: 5
  min: 3
  max: 30
resources:
  requests:
    cpu: "2"
    memory: "4Gi"
  limits:
    cpu: "8"
    memory: "12Gi"
# Prompt caching enabled in production
ai:
  promptCaching:
    enabled: true
  rateLimiting:
    requestsPerMinute: 1000
    burstCapacity: 200
# Full observability in production
monitoring:
  tracing:
    enabled: true
    samplingRate: 0.1 # sample 10% of requests
  logging:
    level: INFO
---
# helm/ai-platform/values-dev.yaml
environment: dev
replicas:
  initial: 1
  min: 1
  max: 3
resources:
  requests:
    cpu: "0.5"
    memory: "512Mi"
  limits:
    cpu: "2"
    memory: "2Gi"
ai:
  rateLimiting:
    requestsPerMinute: 60
    burstCapacity: 10
monitoring:
  tracing:
    enabled: true
    samplingRate: 1.0 # full sampling in dev
  logging:
    level: DEBUG
GitOps流水线
IaC的完整价值需要GitOps才能体现:所有基础设施变更通过PR审查,自动部署:
# .github/workflows/deploy-ai-infra.yml
name: Deploy AI Infrastructure
on:
  # FIX: plan-and-comment needs a pull_request event; apply happens on merge to main
  pull_request:
    paths: ['terraform/**', 'k8s/**', 'helm/**']
  push:
    branches: [main]
    paths: ['terraform/**', 'k8s/**', 'helm/**']
jobs:
  terraform-plan:
    name: Terraform Plan
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
      - name: Terraform Plan
        run: |
          cd terraform/ai-infrastructure
          terraform init
          terraform plan -var="environment=prod" -out=tfplan
      # FIX: jobs run on separate runners — the plan file must be shared explicitly
      - name: Upload plan
        uses: actions/upload-artifact@v4
        with:
          name: tfplan
          path: terraform/ai-infrastructure/tfplan
      # FIX: context.issue.number only exists on pull_request events;
      # unguarded, this step fails on every push to main
      - name: Comment Plan on PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              body: "Terraform plan已生成,请review后批准部署"
            })
  terraform-apply:
    name: Terraform Apply
    needs: terraform-plan
    if: github.event_name == 'push' # apply only after merge, never from a PR
    runs-on: ubuntu-latest
    environment: production # requires manual approval
    steps:
      # FIX: a fresh runner has no repo, no provider plugins, and no plan file —
      # the original job ran `terraform apply tfplan` with none of them present
      - uses: actions/checkout@v4
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
      - name: Download plan
        uses: actions/download-artifact@v4
        with:
          name: tfplan
          path: terraform/ai-infrastructure
      - name: Terraform Apply
        run: |
          cd terraform/ai-infrastructure
          terraform init
          terraform apply tfplan
  helm-deploy:
    name: Deploy AI Services
    needs: terraform-apply
    runs-on: ubuntu-latest
    steps:
      # FIX: chart files are only available after a checkout
      - uses: actions/checkout@v4
      - name: Deploy to Kubernetes
        run: |
          helm upgrade --install ai-platform ./helm/ai-platform \
            -f helm/ai-platform/values-prod.yaml \
            --namespace ai-platform \
            --create-namespace \
            --wait \
            --timeout 5m
把AI基础设施纳入IaC管理,最直接的收益是:新环境(dev/staging/prod)的搭建从"几天"变成"一条命令"。间接收益是:所有变更都有记录,出了问题可以快速回滚,团队之间对基础设施的理解也更一致。
