Package ML applications in portable containers
Docker solves the "works on my machine" problem by packaging everything needed to run your application. **Benefits:** - **Reproducibility**: Consistent environment everywhere - **Portability**: Run anywhere (local, cloud, edge) - **Isolation**: Dependencies don't conflict - **Scalability**: Easy to replicate and scale - **Version Control**: Tag and track container versions **Docker Components:** - **Image**: Blueprint (recipe) for containers - **Container**: Running instance of an image - **Dockerfile**: Instructions to build an image - **Registry**: Storage for images (Docker Hub, ECR, GCR) **ML-Specific Challenges:** - Large model files - GPU support - Python dependencies - Long build times **Solutions:** - Multi-stage builds - Layer caching - NVIDIA Container Toolkit for GPU
Create a production-ready Docker image:
# Multi-stage build for a smaller final image.
# Stage 1: Builder — compiles/installs Python dependencies.
FROM python:3.9-slim AS builder

# Set working directory
WORKDIR /app

# Install build-time system dependencies (compilers for native wheels),
# cleaning the apt lists in the same layer to keep it small.
RUN apt-get update && apt-get install -y \
        gcc \
        g++ \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first (for layer caching — this layer is reused
# until requirements.txt changes).
COPY requirements.txt .

# Install Python dependencies into the user site (/root/.local),
# which is copied wholesale into the runtime stage below.
RUN pip install --no-cache-dir --user -r requirements.txt

# Stage 2: Runtime — only what is needed to run the service.
FROM python:3.9-slim

# Create non-root user for security
RUN useradd -m -u 1000 mluser

# Set working directory
WORKDIR /app

# Copy Python packages from builder
COPY --from=builder /root/.local /home/mluser/.local

# Copy application code (owned by the non-root user)
COPY --chown=mluser:mluser . .

# Copy model files explicitly (redundant with the COPY above unless
# .dockerignore excludes models/ — kept for clarity).
COPY --chown=mluser:mluser models/ /app/models/

# Environment: put the user-site scripts on PATH;
# PYTHONUNBUFFERED=1 so logs stream to the container runtime immediately.
ENV PATH=/home/mluser/.local/bin:$PATH \
    PYTHONUNBUFFERED=1 \
    MODEL_PATH=/app/models/model.joblib

# Expose port
EXPOSE 8000

# Switch to non-root user
USER mluser

# Health check — NOTE: python:3.9-slim does NOT ship curl, so the
# common `curl -f ...` check would always report unhealthy; probe the
# endpoint with Python's standard library instead.
HEALTHCHECK --interval=30s --timeout=3s --start-period=40s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

# Run application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

# This is a Dockerfile — no output when viewing.
# Build and run with:
#   docker build -t ml-service:1.0 .
#   docker run -p 8000:8000 ml-service:1.0
Essential Docker commands for ML deployment:
#!/bin/bash
# Essential Docker commands for ML deployment (reference script —
# intended to be read/copied, not executed end-to-end).

# ===================================
# Building Docker Images
# ===================================

# Build image
docker build -t ml-service:1.0 .

# Build with build arguments
docker build --build-arg MODEL_VERSION=1.2.0 -t ml-service:1.2 .

# Build with no cache (fresh build)
docker build --no-cache -t ml-service:1.0 .

# Multi-platform build (for ARM and x86)
docker buildx build --platform linux/amd64,linux/arm64 -t ml-service:1.0 .

# ===================================
# Running Containers
# ===================================

# Run container (detached, publish port 8000)
docker run -d -p 8000:8000 --name ml-api ml-service:1.0

# Run with environment variables
docker run -d \
    -p 8000:8000 \
    -e MODEL_VERSION=1.0 \
    -e LOG_LEVEL=INFO \
    --name ml-api \
    ml-service:1.0

# Run with volume mount (for models)
docker run -d \
    -p 8000:8000 \
    -v $(pwd)/models:/app/models \
    --name ml-api \
    ml-service:1.0

# Run with GPU support (NVIDIA — requires the NVIDIA Container Toolkit)
docker run -d \
    --gpus all \
    -p 8000:8000 \
    --name ml-api-gpu \
    ml-service:1.0-gpu

# Run with resource limits
docker run -d \
    --memory="2g" \
    --cpus="2.0" \
    -p 8000:8000 \
    --name ml-api \
    ml-service:1.0

# ===================================
# Container Management
# ===================================

# List running containers
docker ps

# List all containers (including stopped)
docker ps -a

# View container logs
docker logs ml-api

# Follow logs in real-time
docker logs -f ml-api

# Execute command in running container
docker exec -it ml-api bash

# Stop container
docker stop ml-api

# Start stopped container
docker start ml-api

# Remove container
docker rm ml-api

# Remove container forcefully
docker rm -f ml-api

# ===================================
# Image Management
# ===================================

# List images
docker images

# Remove image
docker rmi ml-service:1.0

# Tag image
docker tag ml-service:1.0 myregistry.com/ml-service:1.0

# Push to registry
docker push myregistry.com/ml-service:1.0

# Pull from registry
docker pull myregistry.com/ml-service:1.0

# ===================================
# Docker Compose (Multi-container)
# ===================================
# (With Compose v2, `docker compose` replaces `docker-compose`.)

# Start services defined in docker-compose.yml
docker-compose up -d

# Stop services
docker-compose down

# View logs
docker-compose logs -f

# Scale service
docker-compose up -d --scale api=3

# ===================================
# Cleanup
# ===================================

# Remove all stopped containers
docker container prune

# Remove unused images
docker image prune -a

# Remove all unused resources
docker system prune -a

# View disk usage
docker system df

echo "✓ Docker commands reference complete!"
Deploy complete ML stack with Docker Compose:
# docker-compose.yml — full ML stack: API + Redis + PostgreSQL + monitoring.
# NOTE: the top-level `version` key is obsolete (ignored with a warning) in
# Compose v2+; kept here for compatibility with legacy docker-compose.
version: '3.8'

services:
  # ML API Service
  api:
    build:
      context: .
      dockerfile: Dockerfile
    image: ml-service:1.0
    container_name: ml-api
    ports:
      - "8000:8000"
    environment:
      - MODEL_PATH=/app/models/model.joblib
      - REDIS_URL=redis://redis:6379
      # NOTE(review): inline credentials are for the demo only — use an
      # env file or a secrets store in production.
      - POSTGRES_URL=postgresql://user:pass@postgres:5432/mldb
    volumes:
      - ./models:/app/models:ro  # Read-only models
      - ./logs:/app/logs         # Logs
    depends_on:
      - redis
      - postgres
    restart: unless-stopped
    # Health check: the python:3.9-slim-based image does NOT include curl,
    # so probe the endpoint with Python's standard library instead.
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 4G
        reservations:
          cpus: '1.0'
          memory: 2G

  # Redis for caching
  redis:
    image: redis:7-alpine
    container_name: ml-redis
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data
    command: redis-server --appendonly yes
    restart: unless-stopped

  # PostgreSQL for logging
  postgres:
    image: postgres:15-alpine
    container_name: ml-postgres
    environment:
      - POSTGRES_USER=user
      - POSTGRES_PASSWORD=pass  # demo-only credential — override in production
      - POSTGRES_DB=mldb
    ports:
      - "5432:5432"
    volumes:
      - postgres-data:/var/lib/postgresql/data
    restart: unless-stopped

  # Prometheus for monitoring
  prometheus:
    image: prom/prometheus:latest
    container_name: ml-prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
    restart: unless-stopped

  # Grafana for visualization
  grafana:
    image: grafana/grafana:latest
    container_name: ml-grafana
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin  # change before exposing publicly
    volumes:
      - grafana-data:/var/lib/grafana
    depends_on:
      - prometheus
    restart: unless-stopped

# Named volumes for data persistence
volumes:
  redis-data:
  postgres-data:
  prometheus-data:
  grafana-data:

# Network configuration
networks:
  default:
    driver: bridge

# Usage:
#   Start stack:  docker-compose up -d
#   Stop stack:   docker-compose down
#   View logs:    docker-compose logs -f api
# Access:
#   API:        http://localhost:8000
#   Grafana:    http://localhost:3000
#   Prometheus: http://localhost:9090
**Image Optimization:** 1. **Use Multi-stage Builds** - Separate build and runtime stages - Smaller final image (only runtime dependencies) - Faster deployments 2. **Minimize Layers** - Combine RUN commands - Clean up in same layer - Use .dockerignore 3. **Leverage Caching** - Order Dockerfile from least to most frequently changed - Copy requirements.txt before code - Use BuildKit for better caching **Security:** - Don't run as root - Scan images for vulnerabilities (Trivy, Snyk) - Use official base images - Keep images updated - Don't include secrets in images **GPU Support:** ```dockerfile FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04 # ... rest of Dockerfile ``` **Kubernetes Deployment:** - Use Helm charts - Configure resource requests/limits - Set up horizontal pod autoscaling - Use liveness/readiness probes