第2296篇:AI基础设施即代码——用Terraform和Kubernetes管理AI服务
2026/4/30大约 4 分钟
第2296篇:AI基础设施即代码——用Terraform和Kubernetes管理AI服务
适读人群:需要管理AI服务基础设施的工程师和DevOps工程师 | 阅读时长:约14分钟 | 核心价值:掌握用IaC方式管理AI服务的核心资源,实现可重复、可审计的AI基础设施
我们团队有段时间的AI基础设施是手动配置的:Redis配置是运维同学登上去手动设的,Kubernetes的HPA阈值是某个工程师某天改的,向量数据库的分片配置……谁也说不清楚当时为什么这样设。
有一次我们要在新环境里复现生产环境,花了整整两天才搞定,而且还不能保证完全一样。
基础设施即代码(IaC)在AI服务上的价值:让AI服务的基础设施像代码一样可版本控制、可审查、可重复部署。
Terraform管理AI服务的云资源
以AWS为例,AI服务通常需要的云资源:
# terraform/ai-infrastructure/main.tf
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
    # Needed for random_id.suffix below (S3 bucket name uniqueness)
    random = {
      source  = "hashicorp/random"
      version = "~> 3.6"
    }
  }
  backend "s3" {
    bucket = "your-terraform-state-bucket"
    key    = "ai-infrastructure/terraform.tfstate"
    region = "us-east-1"
  }
}

# ---- Input variables ----
variable "environment" {
  description = "部署环境:dev/staging/prod"
  type        = string
}

variable "ai_instance_type" {
  description = "AI服务的EC2实例类型"
  type        = string
  default     = "c5.2xlarge" # 8 cores / 16 GB — suitable for CPU inference
}

# FIX: referenced below as var.db_password but was never declared.
# Inject via TF_VAR_db_password from SSM Parameter Store or Vault; never commit it.
variable "db_password" {
  description = "RDS master password (injected from SSM Parameter Store or Vault)"
  type        = string
  sensitive   = true
}

# FIX: referenced below as random_id.suffix.hex but was never declared.
resource "random_id" "suffix" {
  byte_length = 4
}

# EKS cluster that runs the AI services.
# NOTE(review): module.vpc is expected to be declared elsewhere (e.g. vpc.tf) — confirm.
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 20.0"

  cluster_name    = "ai-platform-${var.environment}"
  cluster_version = "1.29"
  vpc_id          = module.vpc.vpc_id
  subnet_ids      = module.vpc.private_subnets

  # Dedicated AI worker node group — tainted so only AI workloads schedule here
  eks_managed_node_groups = {
    ai_workers = {
      min_size       = 2
      max_size       = 20
      desired_size   = 3
      instance_types = [var.ai_instance_type]
      labels = {
        role = "ai-worker"
      }
      taints = [{
        key    = "ai-workload"
        value  = "true"
        effect = "NO_SCHEDULE"
      }]
    }
  }
}

# ElastiCache Redis — conversation context cache
resource "aws_elasticache_replication_group" "ai_context_cache" {
  replication_group_id = "ai-context-${var.environment}"
  description          = "AI对话上下文缓存"
  engine               = "redis"
  engine_version       = "7.2"
  node_type            = "cache.r7g.large" # memory-optimized instance
  num_cache_clusters   = var.environment == "prod" ? 3 : 1

  # High availability in production only
  automatic_failover_enabled = var.environment == "prod"
  multi_az_enabled           = var.environment == "prod"

  # Encryption at rest and in transit
  at_rest_encryption_enabled = true
  transit_encryption_enabled = true

  parameter_group_name = aws_elasticache_parameter_group.ai_redis_params.name

  tags = {
    Environment = var.environment
    Service     = "ai-platform"
  }
}

# Redis parameter tuning
resource "aws_elasticache_parameter_group" "ai_redis_params" {
  name   = "ai-redis-params-${var.environment}"
  family = "redis7"

  parameter {
    name  = "maxmemory-policy"
    value = "allkeys-lru" # LRU eviction — appropriate for a cache workload
  }
  parameter {
    name  = "timeout"
    value = "300" # close idle connections after 5 minutes
  }
}

# RDS PostgreSQL — vector storage via the pgvector extension
resource "aws_db_instance" "ai_vector_db" {
  identifier            = "ai-vector-db-${var.environment}"
  engine                = "postgres"
  engine_version        = "16.1"
  instance_class        = var.environment == "prod" ? "db.r7g.2xlarge" : "db.t3.medium"
  allocated_storage     = 500  # GB
  max_allocated_storage = 2000 # storage autoscaling ceiling
  storage_encrypted     = true

  db_name  = "aidb"
  username = "aidbadmin"
  password = var.db_password # sourced from SSM Parameter Store or Vault

  # High availability in production only
  multi_az                     = var.environment == "prod"
  backup_retention_period      = 7
  performance_insights_enabled = true

  tags = {
    Environment = var.environment
    Service     = "ai-platform"
  }
}

# Secrets Manager — AI provider API keys
resource "aws_secretsmanager_secret" "ai_api_keys" {
  name                    = "ai-platform/${var.environment}/api-keys"
  description             = "AI服务API密钥"
  recovery_window_in_days = var.environment == "prod" ? 30 : 0
}

# S3 bucket — training data and model artifacts
resource "aws_s3_bucket" "ai_artifacts" {
  bucket = "ai-artifacts-${var.environment}-${random_id.suffix.hex}"

  tags = {
    Environment = var.environment
    Purpose     = "ai-artifacts"
  }
}

resource "aws_s3_bucket_versioning" "ai_artifacts_versioning" {
  bucket = aws_s3_bucket.ai_artifacts.id
  versioning_configuration {
    status = "Enabled"
  }
}

Kubernetes配置AI服务部署
# k8s/ai-service/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ai-conversation-service
  namespace: ai-platform
  labels:
    app: ai-conversation-service
    version: "{{ .Values.imageTag }}"
spec:
  replicas: {{ .Values.replicas.initial }}
  selector:
    matchLabels:
      app: ai-conversation-service
  template:
    metadata:
      labels:
        app: ai-conversation-service
      annotations:
        # Prometheus metrics scraping
        prometheus.io/scrape: "true"
        prometheus.io/port: "9090"
        prometheus.io/path: "/actuator/prometheus"
    spec:
      # Schedule only onto the dedicated AI worker nodes
      nodeSelector:
        role: ai-worker
      tolerations:
        - key: "ai-workload"
          operator: "Equal"
          value: "true"
          effect: "NoSchedule"
      # Graceful shutdown: wait for in-flight AI requests to complete
      terminationGracePeriodSeconds: 120
      containers:
        - name: ai-conversation-service
          image: "{{ .Values.image.repository }}:{{ .Values.imageTag }}"
          ports:
            - containerPort: 8080
              name: http
            - containerPort: 9090
              name: metrics
          # Resource requests/limits
          resources:
            requests:
              cpu: "1"
              memory: "2Gi"
            limits:
              cpu: "4"
              memory: "6Gi"
          env:
            # FIX: templated scalar quoted — an empty expansion would otherwise render null
            - name: SPRING_PROFILES_ACTIVE
              value: "{{ .Values.environment }}"
            - name: REDIS_HOST
              valueFrom:
                secretKeyRef:
                  name: ai-service-secrets
                  key: redis-host
            - name: ANTHROPIC_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ai-api-keys
                  key: anthropic-key
          # Readiness probe (gates traffic routing)
          readinessProbe:
            httpGet:
              path: /actuator/health/readiness
              port: 8080
            initialDelaySeconds: 30 # AI services start slowly
            periodSeconds: 10
            failureThreshold: 3
          # Liveness probe (triggers pod restart)
          livenessProbe:
            httpGet:
              path: /actuator/health/liveness
              port: 8080
            initialDelaySeconds: 60
            periodSeconds: 30
            failureThreshold: 3
# k8s/ai-service/hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ai-conversation-service-hpa
  namespace: ai-platform
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ai-conversation-service
  minReplicas: {{ .Values.replicas.min }}
  maxReplicas: {{ .Values.replicas.max }}
  metrics:
    # Scale on CPU utilization
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    # Custom metric: AI request queue depth (Kafka consumer lag)
    - type: External
      external:
        metric:
          name: kafka_consumer_group_lag
          selector:
            matchLabels:
              consumer_group: ai-conversation-service
        target:
          type: AverageValue
          averageValue: "50"
Helm Chart管理多环境配置差异
# helm/ai-platform/values-prod.yaml
environment: prod
imageTag: "1.5.2"
replicas:
  initial: 5
  min: 3
  max: 30
resources:
  requests:
    cpu: "2"
    memory: "4Gi"
  limits:
    cpu: "8"
    memory: "12Gi"
# Prompt caching enabled in production
ai:
  promptCaching:
    enabled: true
  rateLimiting:
    requestsPerMinute: 1000
    burstCapacity: 200
# Full observability in production
monitoring:
  tracing:
    enabled: true
    samplingRate: 0.1 # sample 10% of requests
  logging:
    level: INFO
---
# helm/ai-platform/values-dev.yaml
environment: dev
replicas:
  initial: 1
  min: 1
  max: 3
resources:
  requests:
    cpu: "0.5"
    memory: "512Mi"
  limits:
    cpu: "2"
    memory: "2Gi"
ai:
  rateLimiting:
    requestsPerMinute: 60
    burstCapacity: 10
monitoring:
  tracing:
    enabled: true
    samplingRate: 1.0 # full sampling in dev
  logging:
    level: DEBUG
GitOps流水线
IaC的完整价值需要GitOps才能体现:所有基础设施变更通过PR审查,自动部署:
# .github/workflows/deploy-ai-infra.yml
name: Deploy AI Infrastructure
on:
  # FIX: plan-and-comment needs a pull_request event; apply happens on merge to main
  pull_request:
    paths: ['terraform/**', 'k8s/**', 'helm/**']
  push:
    branches: [main]
    paths: ['terraform/**', 'k8s/**', 'helm/**']
jobs:
  terraform-plan:
    name: Terraform Plan
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
      - name: Terraform Plan
        run: |
          cd terraform/ai-infrastructure
          terraform init
          terraform plan -var="environment=prod" -out=tfplan
      # FIX: jobs run on separate runners — the plan file must be shared explicitly
      - name: Upload plan
        uses: actions/upload-artifact@v4
        with:
          name: tfplan
          path: terraform/ai-infrastructure/tfplan
      # FIX: context.issue.number only exists on pull_request events;
      # unguarded, this step fails on every push to main
      - name: Comment Plan on PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              body: "Terraform plan已生成,请review后批准部署"
            })
  terraform-apply:
    name: Terraform Apply
    needs: terraform-plan
    if: github.event_name == 'push' # apply only after merge, never from a PR
    runs-on: ubuntu-latest
    environment: production # requires manual approval
    steps:
      # FIX: a fresh runner has no repo, no provider plugins, and no plan file —
      # the original job ran `terraform apply tfplan` with none of them present
      - uses: actions/checkout@v4
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
      - name: Download plan
        uses: actions/download-artifact@v4
        with:
          name: tfplan
          path: terraform/ai-infrastructure
      - name: Terraform Apply
        run: |
          cd terraform/ai-infrastructure
          terraform init
          terraform apply tfplan
  helm-deploy:
    name: Deploy AI Services
    needs: terraform-apply
    runs-on: ubuntu-latest
    steps:
      # FIX: chart files are only available after a checkout
      - uses: actions/checkout@v4
      - name: Deploy to Kubernetes
        run: |
          helm upgrade --install ai-platform ./helm/ai-platform \
            -f helm/ai-platform/values-prod.yaml \
            --namespace ai-platform \
            --create-namespace \
            --wait \
            --timeout 5m
把AI基础设施纳入IaC管理,最直接的收益是:新环境(dev/staging/prod)的搭建从"几天"变成"一条命令"。间接收益是:所有变更都有记录,出了问题可以快速回滚,团队之间对基础设施的理解也更一致。
