From 0b5319b31c643e745bce4489aad2fd700746d0d3 Mon Sep 17 00:00:00 2001 From: leehwui Date: Mon, 8 Dec 2025 17:08:45 +0800 Subject: [PATCH] Add GPU support and improve Docker deployment - Add GPU deployment support with NVIDIA runtime - Update Dockerfile.allinone with GPU environment variables - Add comprehensive GPU_DEPLOYMENT.md guide - Make port 11434 (Ollama) optional for security - Update DEPLOYMENT.md with CPU and GPU deployment options - Simplify default docker run commands - Update healthcheck to only check web application - Add memory requirements documentation - Create MEMORY_REQUIREMENTS.md with model comparison - Add build-8b.sh script for lower memory usage - Document OOM troubleshooting steps - Improve Docker build process - Add BUILD_TROUBLESHOOTING.md for common issues - Add DISTRIBUTION.md for image distribution methods - Update .gitignore to exclude large binary files - Improve docker-entrypoint.sh with better diagnostics - Update .dockerignore to include ollama-linux-amd64.tgz - Add backup file exclusions to .gitignore --- .dockerignore | 2 + .gitignore | 10 +++ DEPLOYMENT.md | 113 ++++++++++++++++++++++++++++++++-- Dockerfile.allinone | 20 ++++-- build-8b.sh | 141 +++++++++++++++++++++++++++++++++++++++++++ build-allinone.sh | 43 +++++++++++-- docker-entrypoint.sh | 78 ++++++++++++++++++++++-- 7 files changed, 387 insertions(+), 20 deletions(-) create mode 100755 build-8b.sh diff --git a/.dockerignore b/.dockerignore index 5747494..c89988f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -14,6 +14,8 @@ build .idea *.md !README.md +# Include pre-downloaded Ollama binary for offline build +!ollama-linux-amd64.tgz local_docs examples outputs diff --git a/.gitignore b/.gitignore index 4cc51dc..5f09286 100644 --- a/.gitignore +++ b/.gitignore @@ -149,6 +149,16 @@ outputs/ *.log local_docs/ +# Docker build artifacts (DO NOT commit these - they are huge!) 
+ollama-models/
+*.tar
+ollama-linux-amd64.tgz
+system-prompt-optimizer-*.tar
+*.tar.gz
+
+# Backup files from scripts
+*.bak
+
# Node modules (if any frontend dependencies)
node_modules/
package-lock.json
diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md
index 7c6060f..958e537 100644
--- a/DEPLOYMENT.md
+++ b/DEPLOYMENT.md
@@ -117,19 +117,22 @@ rsync -avP --progress system-prompt-optimizer-allinone.tar user@server:/path/
# Load the image (this takes a few minutes)
docker load -i system-prompt-optimizer-allinone.tar

+# If you hit a permission error, use sudo
+# sudo docker load -i system-prompt-optimizer-allinone.tar
+
# Verify that the image has been loaded
docker images | grep system-prompt-optimizer
```

#### Step 7: Start the Service

+**CPU mode (default):**
+
```bash
-# Start the container
+# Start the container (recommended: expose only the web port)
docker run -d \
  --name system-prompt-optimizer \
  -p 8010:8010 \
-  -p 11434:11434 \
-  -v $(pwd)/outputs:/app/outputs \
  --restart unless-stopped \
  system-prompt-optimizer:allinone

@@ -137,7 +140,41 @@ docker run -d \
docker logs -f system-prompt-optimizer
```

-**Important**: The first start takes 30-60 seconds while the Ollama service initializes.
+**GPU mode (recommended if an NVIDIA GPU is available):**
+
+```bash
+# Use all available GPUs (recommended)
+docker run -d \
+  --name system-prompt-optimizer \
+  --gpus all \
+  -p 8010:8010 \
+  --restart unless-stopped \
+  system-prompt-optimizer:allinone
+
+# Or pin a specific GPU
+docker run -d \
+  --name system-prompt-optimizer \
+  --gpus '"device=0"' \
+  -p 8010:8010 \
+  --restart unless-stopped \
+  system-prompt-optimizer:allinone
+
+# Follow the startup logs
+docker logs -f system-prompt-optimizer
+```
+
+**GPU deployment prerequisites**:
+- NVIDIA driver installed (`nvidia-smi` works)
+- NVIDIA Container Toolkit installed
+- GPU memory ≥ 10GB (14b model) or ≥ 6GB (8b model)
+
+**Detailed GPU deployment guide**: see [GPU_DEPLOYMENT.md](GPU_DEPLOYMENT.md); a quick GPU usage check is shown below.
+
+**Important**:
+- The first start takes 30-60 seconds (CPU) or 10-20 seconds (GPU) while the Ollama service initializes
+- GPU mode speeds up inference by roughly 5-10x
+- Port 11434 (Ollama) is optional; expose it only if Ollama needs to be reached from outside the container
+- Keeping 11434 unexposed is safer, since the Ollama API has no authentication
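+
+**Quick check that the GPU is actually being used** (optional). This is a minimal sketch: it reuses the container name from the commands above and assumes `ollama ps` is available (recent Ollama releases report whether a loaded model runs on the GPU or the CPU):
+
+```bash
+# The GPU should be visible from inside the container
+docker exec -it system-prompt-optimizer nvidia-smi
+
+# After the first request, check which processor Ollama assigned to the model
+docker exec -it system-prompt-optimizer ollama ps
+```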

#### Step 8: Verify the Deployment

@@ -225,7 +262,8 @@ docker run -d \

### Port Mapping

-- **8010**: Web UI and API port
+- **8010**: Web UI and API port (required)
+- **11434**: Ollama API port (optional, only for debugging or external access to Ollama)

### Data Persistence

@@ -233,6 +271,71 @@ docker run -d \

## Troubleshooting

+### 0. Docker daemon connection errors
+
+**Problem**: Running `docker` commands fails with "Cannot connect to the Docker daemon"
+
+**Symptom**:
+```
+Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?
+```
+
+**Solutions**:
+
+**Option 1: Check the Docker service status**
+```bash
+# Check whether Docker is running
+sudo systemctl status docker
+
+# If it is not running, start it
+sudo systemctl start docker
+
+# Enable it on boot
+sudo systemctl enable docker
+```
+
+**Option 2: Add your user to the docker group (recommended)**
+```bash
+# Add the current user to the docker group
+sudo usermod -aG docker $USER
+
+# Apply the group change (re-login or use newgrp)
+newgrp docker
+
+# Or simply log out and log back in
+
+# Verify
+docker info
+```
+
+**Option 3: Fix the Docker socket permissions**
+```bash
+# Check the socket permissions
+ls -l /var/run/docker.sock
+
+# Fix the permissions
+sudo chown root:docker /var/run/docker.sock
+sudo chmod 660 /var/run/docker.sock
+```
+
+**Option 4: Use sudo as a temporary workaround**
+```bash
+# If none of the above works, run Docker commands with sudo
+sudo docker load -i system-prompt-optimizer-allinone.tar
+sudo docker run -d --name system-prompt-optimizer ...
+```
+
+**Verify the fix**:
+```bash
+# Should print Docker information normally
+docker info
+
+# Should show the current user in the docker group
+groups | grep docker
+```
+
+---
+
### 1. Cannot connect to the Ollama service

**Problem**: The container cannot reach the Ollama service on the host

diff --git a/Dockerfile.allinone b/Dockerfile.allinone
index 4fa0671..b2ef400 100644
--- a/Dockerfile.allinone
+++ b/Dockerfile.allinone
@@ -1,16 +1,20 @@
-FROM python:3.10-slim
+FROM --platform=linux/amd64 python:3.10-slim

# Set working directory
WORKDIR /app

-# Install system dependencies including curl for Ollama
+# Install system dependencies
RUN apt-get update && apt-get install -y \
    curl \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

-# Install Ollama
-RUN curl -fsSL https://ollama.com/install.sh | sh
+# Install Ollama manually for amd64
+# Copy pre-downloaded Ollama binary to avoid slow downloads during build
+# Using v0.13.1 (latest stable as of Dec 2025)
+COPY ollama-linux-amd64.tgz /tmp/ollama-linux-amd64.tgz
+RUN tar -C /usr -xzf /tmp/ollama-linux-amd64.tgz \
+    && rm /tmp/ollama-linux-amd64.tgz

# Copy requirements file
COPY requirements.txt .
@@ -36,14 +40,18 @@ EXPOSE 8010 11434
# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV OLLAMA_HOST=http://localhost:11434
+# Enable GPU support for Ollama (will auto-detect NVIDIA GPU if available)
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility

# Copy startup script
COPY docker-entrypoint.sh /docker-entrypoint.sh
RUN chmod +x /docker-entrypoint.sh

# Health check
-HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
-    CMD curl -f http://localhost:8010/health && curl -f http://localhost:11434/api/tags || exit 1
+# Only check the web application, not Ollama (internal service)
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:8010/health || exit 1

# Run the startup script
ENTRYPOINT ["/docker-entrypoint.sh"]
diff --git a/build-8b.sh b/build-8b.sh
new file mode 100755
index 0000000..b37b14d
--- /dev/null
+++ b/build-8b.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+
+# Quick build script for qwen3:8b (lower memory usage)
+# Use this if your server has less than 12GB RAM
+
+set -e
+
+echo "=========================================="
+echo "Building with qwen3:8b (Lower Memory)"
+echo "=========================================="
+echo ""
+echo "Memory requirements:"
+echo "  - qwen3:8b:  ~5GB RAM"
+echo "  - qwen3:14b: ~10GB RAM"
+echo ""
+
+# Check if 8b model is available
+if ! ollama list | grep -q "qwen3:8b"; then
+    echo "ERROR: qwen3:8b model not found!"
+    echo ""
+    echo "Please download it first:"
+    echo "  ollama pull qwen3:8b"
+    echo ""
+    exit 1
+fi
+
+# Clean up
+echo "Cleaning up previous builds..."
+rm -rf ollama-models/
+docker rmi system-prompt-optimizer:allinone 2>/dev/null || true
+
+# Export 8b model
+echo ""
+echo "Exporting qwen3:8b model..."
+mkdir -p ollama-models/models/{manifests/registry.ollama.ai/library,blobs}
+
+# Function to get blob hashes from manifest
+get_blobs_from_manifest() {
+    local manifest_file=$1
+    grep -o 'sha256:[a-f0-9]\{64\}' "$manifest_file" | sed 's/sha256://' | sort -u
+}
+
+# Function to copy model files
+copy_model() {
+    local model_name=$1
+    local model_tag=$2
+    local manifest_dir="$HOME/.ollama/models/manifests/registry.ollama.ai/library/$model_name"
+
+    if [ ! -d "$manifest_dir" ]; then
+        echo "ERROR: Model manifest not found: $manifest_dir"
+        return 1
+    fi
+
+    echo "  Copying $model_name:$model_tag manifest..."
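+    # The directories below mirror the host's ~/.ollama/models layout (manifests keyed by
+    # registry/library/model/tag, blobs keyed by sha256 digest) so the exported files can be
+    # baked into the image and discovered by the container's Ollama without any download.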
+    mkdir -p "ollama-models/models/manifests/registry.ollama.ai/library/$model_name"
+
+    if [ -f "$manifest_dir/$model_tag" ]; then
+        cp "$manifest_dir/$model_tag" "ollama-models/models/manifests/registry.ollama.ai/library/$model_name/"
+
+        echo "  Finding blob files for $model_name:$model_tag..."
+        local blob_hashes=$(get_blobs_from_manifest "$manifest_dir/$model_tag")
+        local blob_count=0
+
+        for blob_hash in $blob_hashes; do
+            local blob_file="$HOME/.ollama/models/blobs/sha256-$blob_hash"
+            if [ -f "$blob_file" ]; then
+                cp "$blob_file" "ollama-models/models/blobs/" 2>/dev/null
+                blob_count=$((blob_count + 1))
+            fi
+        done
+
+        echo "  ✓ $model_name:$model_tag copied ($blob_count blobs)"
+    else
+        echo "ERROR: Manifest file not found: $manifest_dir/$model_tag"
+        return 1
+    fi
+}
+
+# Copy models
+copy_model "qwen3" "8b" || exit 1
+copy_model "qwen3-embedding" "4b" || exit 1
+
+echo ""
+echo "✓ Models exported successfully"
+echo ""
+
+# Update config.py to use 8b
+echo "Updating config.py to use qwen3:8b..."
+sed -i.bak 's/DEFAULT_CHAT_MODEL = "qwen3:14b"/DEFAULT_CHAT_MODEL = "qwen3:8b"/' config.py
+
+# Update docker-entrypoint.sh to check for 8b
+echo "Updating docker-entrypoint.sh to check for qwen3:8b..."
+sed -i.bak 's/qwen3:14b/qwen3:8b/g' docker-entrypoint.sh
+
+# Build image (run the build inside the if so a failure restores the backups despite set -e)
+echo ""
+echo "Building Docker image..."
+if ! docker build --platform linux/amd64 \
+    -f Dockerfile.allinone \
+    -t system-prompt-optimizer:allinone .; then
+    echo ""
+    echo "Build failed!"
+    # Restore backups
+    mv config.py.bak config.py
+    mv docker-entrypoint.sh.bak docker-entrypoint.sh
+    exit 1
+fi
+
+# Export image
+echo ""
+echo "Exporting Docker image..."
+docker save -o system-prompt-optimizer-allinone.tar system-prompt-optimizer:allinone
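+
+# Optional (untested sketch): record a checksum alongside the archive so the transfer
+# described in DEPLOYMENT.md can be verified on the server with `sha256sum -c`.
+# sha256sum system-prompt-optimizer-allinone.tar > system-prompt-optimizer-allinone.tar.sha256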
-d "ollama-models" ]; then @@ -33,6 +39,19 @@ fi echo "✓ Found ollama-models directory" echo "" +# Check if Ollama binary exists +if [ ! -f "ollama-linux-amd64.tgz" ]; then + echo "ERROR: ollama-linux-amd64.tgz not found!" + echo "" + echo "Please download it first:" + echo " curl -L -o ollama-linux-amd64.tgz https://github.com/ollama/ollama/releases/download/v0.13.1/ollama-linux-amd64.tgz" + echo "" + exit 1 +fi + +echo "✓ Found ollama-linux-amd64.tgz" +echo "" + # Check disk space AVAILABLE_SPACE=$(df -h . | awk 'NR==2 {print $4}') echo "Available disk space: $AVAILABLE_SPACE" @@ -50,7 +69,12 @@ echo "" echo "==========================================" echo "Building Docker image..." echo "==========================================" -docker build -f Dockerfile.allinone -t ${IMAGE_NAME}:${IMAGE_TAG} . +echo "Platform: linux/amd64 (x86_64)" +echo "This may take 20-40 minutes depending on your machine..." +echo "" + +# Build for amd64 platform explicitly +docker build --platform linux/amd64 -f Dockerfile.allinone -t ${IMAGE_NAME}:${IMAGE_TAG} . echo "" echo "==========================================" @@ -83,14 +107,25 @@ echo "2. On target server, load the image:" echo " docker load -i ${EXPORT_FILE}" echo "" echo "3. Run the container:" +echo "" +echo " CPU mode:" echo " docker run -d \\" echo " --name system-prompt-optimizer \\" echo " -p 8010:8010 \\" -echo " -p 11434:11434 \\" -echo " -v \$(pwd)/outputs:/app/outputs \\" echo " --restart unless-stopped \\" echo " ${IMAGE_NAME}:${IMAGE_TAG}" echo "" +echo " GPU mode (recommended if NVIDIA GPU available):" +echo " docker run -d \\" +echo " --name system-prompt-optimizer \\" +echo " --gpus all \\" +echo " -p 8010:8010 \\" +echo " --restart unless-stopped \\" +echo " ${IMAGE_NAME}:${IMAGE_TAG}" +echo "" +echo " Note: Port 11434 (Ollama) is optional and only needed for debugging." +echo " GPU mode provides 5-10x faster inference. See GPU_DEPLOYMENT.md for details." +echo "" echo "4. Access the application:" echo " http://:8010/ui/opro.html" echo "" diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 93141c7..fae03e5 100644 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -2,34 +2,102 @@ set -e +echo "==========================================" +echo "System Prompt Optimizer - Starting Up" +echo "==========================================" +echo "" + +# Check if Ollama binary exists +if ! command -v ollama &> /dev/null; then + echo "ERROR: Ollama binary not found!" + echo "Expected location: /usr/bin/ollama or /usr/local/bin/ollama" + ls -la /usr/bin/ollama* 2>/dev/null || echo "No ollama in /usr/bin/" + ls -la /usr/local/bin/ollama* 2>/dev/null || echo "No ollama in /usr/local/bin/" + exit 1 +fi + +echo "✓ Ollama binary found: $(which ollama)" +echo "" + +# Check if model files exist +echo "Checking model files..." +if [ ! -d "/root/.ollama/models" ]; then + echo "ERROR: /root/.ollama/models directory not found!" + exit 1 +fi + +MANIFEST_COUNT=$(find /root/.ollama/models/manifests -type f 2>/dev/null | wc -l) +BLOB_COUNT=$(find /root/.ollama/models/blobs -type f 2>/dev/null | wc -l) + +echo "✓ Found $MANIFEST_COUNT manifest files" +echo "✓ Found $BLOB_COUNT blob files" + +if [ "$BLOB_COUNT" -lt 10 ]; then + echo "WARNING: Very few blob files found. Models may not be complete." +fi +echo "" + echo "Starting Ollama service..." -ollama serve & +ollama serve > /tmp/ollama.log 2>&1 & +OLLAMA_PID=$! # Wait for Ollama to be ready echo "Waiting for Ollama to start..." 
-for i in {1..30}; do +OLLAMA_READY=false +for i in {1..60}; do if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then echo "Ollama is ready!" + OLLAMA_READY=true break fi - echo "Waiting for Ollama... ($i/30)" - sleep 2 + echo "Waiting for Ollama... ($i/60)" + sleep 3 done +if [ "$OLLAMA_READY" = false ]; then + echo "" + echo "ERROR: Ollama failed to start within 3 minutes!" + echo "" + echo "Ollama logs:" + cat /tmp/ollama.log + echo "" + echo "Check full logs with: docker logs system-prompt-optimizer" + exit 1 +fi + # Check if models exist, if not, show warning +echo "" echo "Checking for models..." ollama list +echo "" if ! ollama list | grep -q "qwen3:14b"; then - echo "WARNING: qwen3:14b model not found!" + echo "ERROR: qwen3:14b model not found!" echo "The application requires qwen3:14b to function properly." + echo "" + echo "Available models:" + ollama list + echo "" + exit 1 fi if ! ollama list | grep -q "qwen3-embedding"; then echo "WARNING: qwen3-embedding model not found!" echo "The application requires qwen3-embedding:4b for embeddings." + echo "Continuing anyway, but embeddings may not work." fi +echo "" +echo "✓ All required models are available" +echo "" +echo "==========================================" echo "Starting FastAPI application..." +echo "==========================================" +echo "Application will be available at:" +echo " - Web UI: http://localhost:8010/ui/opro.html" +echo " - API Docs: http://localhost:8010/docs" +echo " - Ollama: http://localhost:11434" +echo "" + exec uvicorn _qwen_xinference_demo.api:app --host 0.0.0.0 --port 8010