From 1376d60ed515281507fcb8f8c85922e9e48ea74a Mon Sep 17 00:00:00 2001 From: leehwui Date: Sat, 6 Dec 2025 17:24:28 +0800 Subject: [PATCH] feat: implement true OPRO with Gemini-style UI - Add true OPRO system instruction optimization (vs query rewriting) - Implement iterative optimization with performance trajectory - Add new OPRO API endpoints (/opro/create, /opro/generate_and_evaluate, /opro/execute) - Create modern Gemini-style chat UI (frontend/opro.html) - Optimize performance: reduce candidates from 20 to 10 (2x faster) - Add model selector in UI toolbar - Add collapsible sidebar with session management - Add copy button for instructions - Ensure all generated prompts use simplified Chinese - Update README with comprehensive documentation - Add .gitignore for local_docs folder --- .gitignore | 1 + README.md | 219 +++++++- _qwen_xinference_demo/api.py | 374 ++++++++++++- _qwen_xinference_demo/opro/prompt_utils.py | 110 ++++ _qwen_xinference_demo/opro/session_state.py | 170 ++++++ .../opro/user_prompt_optimizer.py | 98 +++- config.py | 3 +- examples/opro_demo.py | 164 ++++++ frontend/opro.html | 507 ++++++++++++++++++ test_opro_api.py | 184 +++++++ 10 files changed, 1817 insertions(+), 13 deletions(-) create mode 100644 examples/opro_demo.py create mode 100644 frontend/opro.html create mode 100644 test_opro_api.py diff --git a/.gitignore b/.gitignore index e0e1f69..4cc51dc 100644 --- a/.gitignore +++ b/.gitignore @@ -147,6 +147,7 @@ cython_debug/ outputs/ *.jsonl *.log +local_docs/ # Node modules (if any frontend dependencies) node_modules/ diff --git a/README.md b/README.md index ea75157..99c9c8e 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,218 @@ +# OPRO Prompt Optimizer + +## 功能概述 + +OPRO (Optimization by PROmpting) 是一个基于大语言模型的提示词优化系统。本项目实现了真正的 OPRO 算法,通过迭代优化系统指令(System Instructions)来提升 LLM 在特定任务上的性能。 + +### 核心功能 + +- **系统指令优化**:使用 LLM 作为优化器,基于历史性能轨迹生成更优的系统指令 +- **多轮迭代优化**:支持多轮优化,每轮基于前一轮的性能反馈生成新的候选指令 +- **智能候选选择**:通过语义聚类和多样性选择,从大量候选中筛选出最具代表性的指令 +- **性能评估**:支持自定义测试用例对系统指令进行自动评估 +- **会话管理**:支持多个优化任务的并行管理和历史记录 + +### 用户界面 + +- **现代化聊天界面**:类似 Google Gemini 的简洁设计 +- **侧边栏会话管理**:可折叠的侧边栏,支持多会话切换 +- **实时优化反馈**:每轮优化生成 3-5 个候选指令,用户可选择继续优化或执行 +- **模型选择**:支持在界面中选择不同的 LLM 模型 + +## 主要优化改进 + +### 1. 真正的 OPRO 实现 + +原始代码实现的是查询重写(Query Rewriting),而非真正的 OPRO。我们添加了完整的 OPRO 功能: + +- **系统指令生成**:`generate_system_instruction_candidates()` - 生成多样化的系统指令候选 +- **性能评估**:`evaluate_system_instruction()` - 基于测试用例评估指令性能 +- **轨迹优化**:基于历史 (instruction, score) 轨迹生成更优指令 +- **元提示工程**:专门设计的元提示用于指导 LLM 生成和优化系统指令 + +### 2. 性能优化 + +- **候选池大小优化**:从 20 个候选减少到 10 个,速度提升约 2 倍 +- **智能聚类选择**:使用 AgglomerativeClustering 从候选池中选择最具多样性的 Top-K +- **嵌入服务回退**:Xinference → Ollama 自动回退机制,确保服务可用性 + +### 3. API 架构改进 + +- **新增 OPRO 端点**: + - `POST /opro/create` - 创建 OPRO 优化任务 + - `POST /opro/generate_and_evaluate` - 生成并自动评估候选 + - `POST /opro/execute` - 执行系统指令 + - `GET /opro/runs` - 获取所有优化任务 + - `GET /opro/run/{run_id}` - 获取特定任务详情 +- **会话状态管理**:完整的 OPRO 运行状态跟踪(轨迹、测试用例、迭代次数) +- **向后兼容**:保留原有查询重写功能,标记为 `opro-legacy` + +### 4. 
前端界面重构 + +- **Gemini 风格设计**:简洁的白色/灰色配色,圆角设计,微妙的阴影效果 +- **可折叠侧边栏**:默认折叠,支持会话列表管理 +- **多行输入框**:支持多行文本输入,底部工具栏包含模型选择器 +- **候选指令卡片**:每个候选显示编号、内容、分数,提供"继续优化"、"复制"、"执行"按钮 +- **简体中文界面**:所有 UI 文本和生成的指令均使用简体中文 + +## 快速开始 + +### 环境要求 + +- **Python** ≥ 3.10(推荐使用 conda 虚拟环境) +- **Ollama** 本地服务及模型(如 `qwen3:8b`、`qwen3-embedding:4b`) +- **可选**:Xinference embedding 服务 + +### 安装依赖 + +```bash +# 创建 conda 环境(推荐) +conda create -n opro python=3.10 +conda activate opro + +# 安装 Python 依赖 +pip install fastapi uvicorn requests numpy scikit-learn pydantic +``` + +### 启动 Ollama 服务 + +```bash +# 确保 Ollama 已安装并运行 +ollama serve + +# 拉取所需模型 +ollama pull qwen3:8b +ollama pull qwen3-embedding:4b +``` + +### 启动应用 + +```bash +# 启动后端服务 +uvicorn _qwen_xinference_demo.api:app --host 127.0.0.1 --port 8010 + +# 或使用 0.0.0.0 允许外部访问 +uvicorn _qwen_xinference_demo.api:app --host 0.0.0.0 --port 8010 +``` + +### 访问界面 + +- **OPRO 优化界面**:http://127.0.0.1:8010/ui/opro.html +- **传统三栏界面**:http://127.0.0.1:8010/ui/ +- **API 文档**:http://127.0.0.1:8010/docs +- **OpenAPI JSON**:http://127.0.0.1:8010/openapi.json + +### 使用示例 + +1. **创建新会话**:在 OPRO 界面点击"新建会话"或侧边栏的 + 按钮 +2. **输入任务描述**:例如"将中文翻译成英文" +3. **查看候选指令**:系统生成 3-5 个优化的系统指令 +4. **继续优化**:点击"继续优化"进行下一轮迭代 +5. **执行指令**:点击"执行此指令"测试指令效果 +6. **复制指令**:点击"复制"按钮将指令复制到剪贴板 + +## 配置说明 + +配置文件:`config.py` + +### 关键配置项 + +```python +# Ollama 服务配置 +OLLAMA_HOST = "http://127.0.0.1:11434" +DEFAULT_CHAT_MODEL = "qwen3:8b" +DEFAULT_EMBED_MODEL = "qwen3-embedding:4b" + +# OPRO 优化参数 +GENERATION_POOL_SIZE = 10 # 生成候选池大小 +TOP_K = 5 # 返回给用户的候选数量 +CLUSTER_DISTANCE_THRESHOLD = 0.15 # 聚类距离阈值 + +# Xinference 配置(可选) +XINFERENCE_EMBED_URL = "http://127.0.0.1:9997/models/bge-base-zh/embed" +``` + +## 项目结构 + +``` +. +├── _qwen_xinference_demo/ +│ ├── api.py # FastAPI 主应用 +│ └── opro/ +│ ├── user_prompt_optimizer.py # OPRO 核心逻辑 +│ ├── prompt_utils.py # 元提示生成 +│ ├── session_state.py # 会话状态管理 +│ ├── ollama_client.py # Ollama 客户端 +│ └── xinference_client.py # Xinference 客户端 +├── frontend/ +│ ├── opro.html # OPRO 优化界面 +│ └── index.html # 传统三栏界面 +├── examples/ +│ ├── opro_demo.py # OPRO 功能演示 +│ └── client_demo.py # API 调用示例 +├── config.py # 全局配置 +├── API.md # API 文档 +└── README.md # 本文件 +``` + +## API 端点 + +### OPRO 相关(推荐使用) + +- `POST /opro/create` - 创建优化任务 +- `POST /opro/generate_and_evaluate` - 生成并评估候选 +- `POST /opro/execute` - 执行系统指令 +- `GET /opro/runs` - 获取所有任务 +- `GET /opro/run/{run_id}` - 获取任务详情 + +### 传统端点(向后兼容) + +- `POST /query` - 查询重写(首轮) +- `POST /select` - 选择候选并回答 +- `POST /reject` - 拒绝并重新生成 +- `POST /message` - 聊天消息 + +### 通用端点 + +- `GET /health` - 健康检查 +- `GET /version` - 版本信息 +- `GET /models` - 可用模型列表 +- `POST /set_model` - 设置模型 + +详细 API 文档请访问:http://127.0.0.1:8010/docs + +## 常见问题 + +### 1. 无法连接 Ollama 服务 + +确保 Ollama 服务正在运行: +```bash +ollama serve +``` + +检查配置文件中的 `OLLAMA_HOST` 是否正确。 + +### 2. 模型不可用 + +通过 `/models` 端点查看可用模型列表,使用 `/set_model` 切换模型。 + +### 3. 生成速度慢 + +- 调整 `GENERATION_POOL_SIZE` 减少候选数量 +- 使用更小的模型(如 `qwen3:4b`) +- 确保 Ollama 使用 GPU 加速 + +### 4. 界面显示异常 + +硬刷新浏览器缓存: +- **Mac**: `Cmd + Shift + R` +- **Windows/Linux**: `Ctrl + Shift + R` + +--- + +
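+## API 调用示例(Python)
+
+下面是一个最小的 Python 调用示例,串联上文列出的三个核心端点(示意性质:假设服务运行在 http://127.0.0.1:8010,返回结构为 `{"success": ..., "data": ...}`;具体字段以 /docs 为准):
+
+```python
+import requests
+
+BASE = "http://127.0.0.1:8010"
+
+# 1. 创建优化任务(test_cases 可选;提供后可自动评估候选指令)
+run = requests.post(f"{BASE}/opro/create", json={
+    "task_description": "将用户输入的中文翻译成英文",
+    "test_cases": [{"input": "你好", "expected_output": "Hello"}],
+}).json()["data"]
+
+# 2. 生成并(自动)评估候选系统指令,返回按分数排序的 Top-K
+result = requests.post(f"{BASE}/opro/generate_and_evaluate", json={
+    "run_id": run["run_id"],
+    "top_k": 5,
+}).json()["data"]
+best = result["candidates"][0]["instruction"]
+
+# 3. 以选中的系统指令执行一次任务
+answer = requests.post(f"{BASE}/opro/execute", json={
+    "instruction": best,
+    "user_input": "今天天气真好",
+}).json()["data"]["response"]
+print(answer)
+```
+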
+原始 README(点击展开) + - 项目简介 - OPRO Prompt Optimizer:面向提示优化的交互式系统,支持多轮拒选/再生成、语义聚类去重与 Top‑K 代表选择。 @@ -64,4 +279,6 @@ - 模型不可用: /models 查看列表并通过 /set_model 应用;错误返回 MODEL_NOT_AVAILABLE - 第二轮无相关候选:使用 POST /query_from_message 基于最近消息再生候选 _qwen_xinference_demo/api.py:193-206 - 立即回答诉求:用 POST /answer 先答后给候选 _qwen_xinference_demo/api.py:211-219 -- 端口与地址访问差异:在启动命令中明确 --host 0.0.0.0 --port 8010 ,本地浏览器建议访问 127.0.0.1 \ No newline at end of file +- 端口与地址访问差异:在启动命令中明确 --host 0.0.0.0 --port 8010 ,本地浏览器建议访问 127.0.0.1 + +
\ No newline at end of file diff --git a/_qwen_xinference_demo/api.py b/_qwen_xinference_demo/api.py index 5f982d7..4cb1f96 100644 --- a/_qwen_xinference_demo/api.py +++ b/_qwen_xinference_demo/api.py @@ -2,14 +2,30 @@ from fastapi import FastAPI, HTTPException, Request from fastapi.responses import RedirectResponse, FileResponse, JSONResponse from fastapi.staticfiles import StaticFiles from pydantic import BaseModel +from typing import List, Tuple, Optional import config +# Legacy session management (query rewriting) from .opro.session_state import create_session, get_session, update_session_add_candidates, log_user_choice from .opro.session_state import log_user_reject from .opro.session_state import set_selected_prompt, log_chat_message from .opro.session_state import set_session_model from .opro.session_state import USER_FEEDBACK_LOG + +# True OPRO session management +from .opro.session_state import ( + create_opro_run, get_opro_run, update_opro_iteration, + add_opro_evaluation, get_opro_trajectory, set_opro_test_cases, + complete_opro_run, list_opro_runs +) + +# Optimization functions from .opro.user_prompt_optimizer import generate_candidates +from .opro.user_prompt_optimizer import ( + generate_system_instruction_candidates, + evaluate_system_instruction +) + from .opro.ollama_client import call_qwen from .opro.ollama_client import list_models @@ -23,8 +39,9 @@ app = FastAPI( openapi_tags=[ {"name": "health", "description": "健康检查"}, {"name": "models", "description": "模型列表与设置"}, - {"name": "sessions", "description": "会话管理"}, - {"name": "opro", "description": "提示优化候选生成与选择/拒绝"}, + {"name": "sessions", "description": "会话管理(旧版查询重写)"}, + {"name": "opro-legacy", "description": "旧版提示优化(查询重写)"}, + {"name": "opro-true", "description": "真正的OPRO(系统指令优化)"}, {"name": "chat", "description": "会话聊天"}, {"name": "ui", "description": "静态页面"} ] @@ -89,14 +106,69 @@ class SetModelReq(BaseModel): session_id: str model_name: str -@app.post("/start", tags=["opro"]) + +# ============================================================================ +# TRUE OPRO REQUEST MODELS +# ============================================================================ + +class TestCase(BaseModel): + """A single test case for OPRO evaluation.""" + input: str + expected_output: str + + +class CreateOPRORunReq(BaseModel): + """Request to create a new OPRO optimization run.""" + task_description: str + test_cases: Optional[List[TestCase]] = None + model_name: Optional[str] = None + + +class OPROIterateReq(BaseModel): + """Request to run one OPRO iteration.""" + run_id: str + top_k: Optional[int] = None + + +class OPROEvaluateReq(BaseModel): + """Request to evaluate a system instruction.""" + run_id: str + instruction: str + + +class OPROAddTestCasesReq(BaseModel): + """Request to add test cases to an OPRO run.""" + run_id: str + test_cases: List[TestCase] + + +class OPROGenerateAndEvaluateReq(BaseModel): + """Request to generate and auto-evaluate candidates (for chat-like UX).""" + run_id: str + top_k: Optional[int] = None + pool_size: Optional[int] = None + auto_evaluate: Optional[bool] = True # If False, use diversity-based selection only + + +class OPROExecuteReq(BaseModel): + """Request to execute a system instruction with user input.""" + instruction: str + user_input: str + model_name: Optional[str] = None + + +# ============================================================================ +# LEGACY ENDPOINTS (Query Rewriting - NOT true OPRO) +# ============================================================================ + 
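+# Typical legacy flow (kept for backward compatibility):
+#   POST /start  -> create a session and return rewritten-query candidates
+#   POST /select -> answer using the chosen candidate
+#   POST /reject -> record a rejected candidate
+#   POST /next   -> generate the next round of candidates for the session
+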
+@app.post("/start", tags=["opro-legacy"]) def start(req: StartReq): sid = create_session(req.query) cands = generate_candidates(req.query, [], model_name=get_session(sid).get("model_name")) update_session_add_candidates(sid, cands) return ok({"session_id": sid, "round": 0, "candidates": cands}) -@app.post("/next", tags=["opro"]) +@app.post("/next", tags=["opro-legacy"]) def next_round(req: NextReq): s = get_session(req.session_id) if not s: @@ -110,7 +182,7 @@ def next_round(req: NextReq): update_session_add_candidates(req.session_id, cands) return ok({"session_id": req.session_id, "round": s["round"], "candidates": cands}) -@app.post("/select", tags=["opro"]) +@app.post("/select", tags=["opro-legacy"]) def select(req: SelectReq): s = get_session(req.session_id) if not s: @@ -138,7 +210,7 @@ def select(req: SelectReq): pass return ok({"prompt": req.choice, "answer": ans}) -@app.post("/reject", tags=["opro"]) +@app.post("/reject", tags=["opro-legacy"]) def reject(req: RejectReq): s = get_session(req.session_id) if not s: @@ -151,7 +223,7 @@ class QueryReq(BaseModel): query: str session_id: str | None = None -@app.post("/query", tags=["opro"]) +@app.post("/query", tags=["opro-legacy"]) def query(req: QueryReq): if req.session_id: s = get_session(req.session_id) @@ -240,7 +312,7 @@ def message(req: MessageReq): class QueryFromMsgReq(BaseModel): session_id: str -@app.post("/query_from_message", tags=["opro"]) +@app.post("/query_from_message", tags=["opro-legacy"]) def query_from_message(req: QueryFromMsgReq): s = get_session(req.session_id) if not s: @@ -258,7 +330,7 @@ def query_from_message(req: QueryFromMsgReq): class AnswerReq(BaseModel): query: str -@app.post("/answer", tags=["opro"]) +@app.post("/answer", tags=["opro-legacy"]) def answer(req: AnswerReq): sid = create_session(req.query) log_chat_message(sid, "user", req.query) @@ -282,3 +354,287 @@ def set_model(req: SetModelReq): raise AppException(400, f"model not available: {req.model_name}", "MODEL_NOT_AVAILABLE") set_session_model(req.session_id, req.model_name) return ok({"session_id": req.session_id, "model_name": req.model_name}) + + +# ============================================================================ +# TRUE OPRO ENDPOINTS (System Instruction Optimization) +# ============================================================================ + +@app.post("/opro/create", tags=["opro-true"]) +def opro_create_run(req: CreateOPRORunReq): + """ + Create a new OPRO optimization run. + + This starts a new system instruction optimization process for a given task. + """ + # Convert test cases from Pydantic models to tuples + test_cases = None + if req.test_cases: + test_cases = [(tc.input, tc.expected_output) for tc in req.test_cases] + + run_id = create_opro_run( + task_description=req.task_description, + test_cases=test_cases, + model_name=req.model_name + ) + + run = get_opro_run(run_id) + + return ok({ + "run_id": run_id, + "task_description": run["task_description"], + "num_test_cases": len(run["test_cases"]), + "iteration": run["iteration"], + "status": run["status"] + }) + + +@app.post("/opro/iterate", tags=["opro-true"]) +def opro_iterate(req: OPROIterateReq): + """ + Run one OPRO iteration: generate new system instruction candidates. + + This generates optimized system instructions based on the performance trajectory. 
+ """ + run = get_opro_run(req.run_id) + if not run: + raise AppException(404, "OPRO run not found", "RUN_NOT_FOUND") + + # Get trajectory for optimization + trajectory = get_opro_trajectory(req.run_id) + + # Generate candidates + top_k = req.top_k or config.TOP_K + try: + candidates = generate_system_instruction_candidates( + task_description=run["task_description"], + trajectory=trajectory if trajectory else None, + top_k=top_k, + model_name=run["model_name"] + ) + except Exception as e: + raise AppException(500, f"Failed to generate candidates: {e}", "GENERATION_ERROR") + + # Update run with new candidates + update_opro_iteration(req.run_id, candidates) + + return ok({ + "run_id": req.run_id, + "iteration": run["iteration"] + 1, + "candidates": candidates, + "num_candidates": len(candidates), + "best_score": run["best_score"] + }) + + +@app.post("/opro/evaluate", tags=["opro-true"]) +def opro_evaluate(req: OPROEvaluateReq): + """ + Evaluate a system instruction on the test cases. + + This scores the instruction and updates the performance trajectory. + """ + run = get_opro_run(req.run_id) + if not run: + raise AppException(404, "OPRO run not found", "RUN_NOT_FOUND") + + if not run["test_cases"]: + raise AppException(400, "No test cases defined for this run", "NO_TEST_CASES") + + # Evaluate the instruction + try: + score = evaluate_system_instruction( + system_instruction=req.instruction, + test_cases=run["test_cases"], + model_name=run["model_name"] + ) + except Exception as e: + raise AppException(500, f"Evaluation failed: {e}", "EVALUATION_ERROR") + + # Add to trajectory + add_opro_evaluation(req.run_id, req.instruction, score) + + # Get updated run info + run = get_opro_run(req.run_id) + + return ok({ + "run_id": req.run_id, + "instruction": req.instruction, + "score": score, + "best_score": run["best_score"], + "is_new_best": score == run["best_score"] and score > 0 + }) + + +@app.get("/opro/runs", tags=["opro-true"]) +def opro_list_runs(): + """ + List all OPRO optimization runs. + """ + runs = list_opro_runs() + return ok({"runs": runs, "total": len(runs)}) + + +@app.get("/opro/run/{run_id}", tags=["opro-true"]) +def opro_get_run(run_id: str): + """ + Get detailed information about an OPRO run. + """ + run = get_opro_run(run_id) + if not run: + raise AppException(404, "OPRO run not found", "RUN_NOT_FOUND") + + # Get sorted trajectory + trajectory = get_opro_trajectory(run_id) + + return ok({ + "run_id": run_id, + "task_description": run["task_description"], + "iteration": run["iteration"], + "status": run["status"], + "best_score": run["best_score"], + "best_instruction": run["best_instruction"], + "num_test_cases": len(run["test_cases"]), + "test_cases": [{"input": tc[0], "expected_output": tc[1]} for tc in run["test_cases"]], + "trajectory": [{"instruction": inst, "score": score} for inst, score in trajectory[:10]], # Top 10 + "current_candidates": run["current_candidates"] + }) + + +@app.post("/opro/test_cases", tags=["opro-true"]) +def opro_add_test_cases(req: OPROAddTestCasesReq): + """ + Add or update test cases for an OPRO run. 
+ """ + run = get_opro_run(req.run_id) + if not run: + raise AppException(404, "OPRO run not found", "RUN_NOT_FOUND") + + # Convert test cases + test_cases = [(tc.input, tc.expected_output) for tc in req.test_cases] + + # Update test cases + set_opro_test_cases(req.run_id, test_cases) + + return ok({ + "run_id": req.run_id, + "num_test_cases": len(test_cases), + "test_cases": [{"input": tc[0], "expected_output": tc[1]} for tc in test_cases] + }) + + +@app.post("/opro/generate_and_evaluate", tags=["opro-true"]) +def opro_generate_and_evaluate(req: OPROGenerateAndEvaluateReq): + """ + Generate candidates and auto-evaluate them (for chat-like UX). + + This is the main endpoint for the chat interface. It: + 1. Generates candidates based on trajectory + 2. Auto-evaluates them (if test cases exist and auto_evaluate=True) + 3. Returns top-k sorted by score (or diversity if no evaluation) + """ + run = get_opro_run(req.run_id) + if not run: + raise AppException(404, "OPRO run not found", "RUN_NOT_FOUND") + + top_k = req.top_k or config.TOP_K + pool_size = req.pool_size or config.GENERATION_POOL_SIZE + + # Get trajectory for optimization + trajectory = get_opro_trajectory(req.run_id) + + # Generate candidates + try: + candidates = generate_system_instruction_candidates( + task_description=run["task_description"], + trajectory=trajectory if trajectory else None, + top_k=pool_size, # Generate pool_size candidates first + pool_size=pool_size, + model_name=run["model_name"] + ) + except Exception as e: + raise AppException(500, f"Failed to generate candidates: {e}", "GENERATION_ERROR") + + # Decide whether to evaluate + should_evaluate = req.auto_evaluate and len(run["test_cases"]) > 0 + + if should_evaluate: + # Auto-evaluate all candidates + scored_candidates = [] + for candidate in candidates: + try: + score = evaluate_system_instruction( + system_instruction=candidate, + test_cases=run["test_cases"], + model_name=run["model_name"] + ) + scored_candidates.append({"instruction": candidate, "score": score}) + + # Add to trajectory + add_opro_evaluation(req.run_id, candidate, score) + except Exception as e: + # If evaluation fails, assign score 0 + scored_candidates.append({"instruction": candidate, "score": 0.0}) + + # Sort by score (highest first) + scored_candidates.sort(key=lambda x: x["score"], reverse=True) + + # Return top-k + top_candidates = scored_candidates[:top_k] + + # Update iteration + update_opro_iteration(req.run_id, [c["instruction"] for c in top_candidates]) + + return ok({ + "run_id": req.run_id, + "candidates": top_candidates, + "iteration": run["iteration"] + 1, + "evaluated": True, + "best_score": run["best_score"] + }) + else: + # No evaluation - use diversity-based selection (already done by clustering) + # Just return the candidates without scores + top_candidates = [ + {"instruction": candidate, "score": None} + for candidate in candidates[:top_k] + ] + + # Update iteration + update_opro_iteration(req.run_id, [c["instruction"] for c in top_candidates]) + + return ok({ + "run_id": req.run_id, + "candidates": top_candidates, + "iteration": run["iteration"] + 1, + "evaluated": False, + "best_score": run["best_score"] + }) + + +@app.post("/opro/execute", tags=["opro-true"]) +def opro_execute(req: OPROExecuteReq): + """ + Execute a system instruction with user input. + + This uses the selected instruction as a system prompt and calls the LLM. 
+ """ + try: + # Construct full prompt with system instruction + full_prompt = f"{req.instruction}\n\n{req.user_input}" + + # Call LLM + response = call_qwen( + full_prompt, + temperature=0.2, + max_tokens=1024, + model_name=req.model_name + ) + + return ok({ + "instruction": req.instruction, + "user_input": req.user_input, + "response": response + }) + except Exception as e: + raise AppException(500, f"Execution failed: {e}", "EXECUTION_ERROR") diff --git a/_qwen_xinference_demo/opro/prompt_utils.py b/_qwen_xinference_demo/opro/prompt_utils.py index 4d44486..1fdff2f 100644 --- a/_qwen_xinference_demo/opro/prompt_utils.py +++ b/_qwen_xinference_demo/opro/prompt_utils.py @@ -1,4 +1,14 @@ +from typing import List, Tuple + +# ============================================================================ +# OLD FUNCTIONS (Query Rewriting - NOT true OPRO, kept for compatibility) +# ============================================================================ + def refine_instruction(query: str) -> str: + """ + LEGACY: Generates query rewrites (NOT true OPRO). + This is query expansion, not system instruction optimization. + """ return f""" 你是一个“问题澄清与重写助手”。 请根据用户的原始问题: @@ -7,6 +17,9 @@ def refine_instruction(query: str) -> str: """ def refine_instruction_with_history(query: str, rejected_list: list) -> str: + """ + LEGACY: Generates query rewrites with rejection history (NOT true OPRO). + """ rejected_text = "\n".join(f"- {r}" for r in rejected_list) if rejected_list else "" return f""" 你是一个“问题澄清与重写助手”。 @@ -18,3 +31,100 @@ def refine_instruction_with_history(query: str, rejected_list: list) -> str: 请从新的角度重新生成至少20条不同的改写问题,每条单独一行。 """ + + +# ============================================================================ +# TRUE OPRO FUNCTIONS (System Instruction Optimization) +# ============================================================================ + +def generate_initial_system_instruction_candidates(task_description: str, pool_size: int = None) -> str: + """ + TRUE OPRO: Generates initial candidate System Instructions for a new OPRO run. + + Args: + task_description: Description of the task the LLM should perform + pool_size: Number of candidates to generate (defaults to config.GENERATION_POOL_SIZE) + + Returns: + Meta-prompt that instructs the optimizer LLM to generate system instruction candidates + """ + import config + pool_size = pool_size or config.GENERATION_POOL_SIZE + + return f""" +你是一个"系统指令生成助手"。 +目标任务描述: +【{task_description}】 + +请根据以上任务,生成 {pool_size} 条高质量、风格各异的"System Instruction"候选指令。 + +要求: +1. 每条指令必须有明显不同的风格和侧重点 +2. 覆盖不同的实现策略(例如:简洁型、详细型、示例型、角色扮演型、步骤型等) +3. 这些指令应指导LLM的行为和输出格式,以最大化任务性能 +4. 每条指令单独成行,不包含编号或额外说明 +5. 所有生成的指令必须使用简体中文 + +生成 {pool_size} 条指令: +""" + + +def generate_optimized_system_instruction( + task_description: str, + trajectory: List[Tuple[str, float]], + pool_size: int = None +) -> str: + """ + TRUE OPRO: Analyzes performance trajectory and generates optimized System Instructions. + + This is the core OPRO function that uses an LLM as an optimizer to improve + system instructions based on historical performance scores. 
+ + Args: + task_description: Description of the task the LLM should perform + trajectory: List of (instruction, score) tuples, sorted by score (highest first) + pool_size: Number of candidates to generate (defaults to config.GENERATION_POOL_SIZE) + + Returns: + Meta-prompt that instructs the optimizer LLM to generate better system instructions + """ + import config + pool_size = pool_size or config.GENERATION_POOL_SIZE + + if not trajectory: + # If no trajectory, fall back to initial generation + return generate_initial_system_instruction_candidates(task_description, pool_size) + + # Format the trajectory for the Optimizer LLM + formatted_history = "\n".join( + f"--- Instruction Score: {score:.4f}\n{instruction}" + for instruction, score in trajectory + ) + + # Determine the current highest score to set the optimization goal + highest_score = max(score for _, score in trajectory) + + # Construct the Meta-Prompt (The OPRO Instruction) + return f""" +你是一个"System Prompt 优化器"。 +你的任务是改进一个LLM的系统指令,以最大化其在以下任务中的性能: +【{task_description}】 + +--- +**历史性能轨迹 (Instructions and Scores):** +{formatted_history} +--- +**当前最高得分: {highest_score:.4f}** + +请分析得分最高的指令的特点和得分最低指令的缺陷。 +然后,生成 {pool_size} 条新的、有潜力超越 {highest_score:.4f} 分的System Instruction。 + +要求: +1. 每条指令必须有明显不同的改进策略 +2. 结合高分指令的优点,避免低分指令的缺陷 +3. 探索新的优化方向和表达方式 +4. 每条指令单独成行,不包含编号或额外说明 +5. 所有生成的指令必须使用简体中文 + +生成 {pool_size} 条优化后的指令: +""" diff --git a/_qwen_xinference_demo/opro/session_state.py b/_qwen_xinference_demo/opro/session_state.py index 96c7c77..5ff87b7 100644 --- a/_qwen_xinference_demo/opro/session_state.py +++ b/_qwen_xinference_demo/opro/session_state.py @@ -1,8 +1,14 @@ import uuid +from typing import List, Tuple, Dict, Any +# Legacy session storage (for query rewriting) SESSIONS = {} USER_FEEDBACK_LOG = [] +# OPRO session storage (for system instruction optimization) +OPRO_RUNS = {} +OPRO_RUN_LOG = [] + def create_session(query: str) -> str: sid = uuid.uuid4().hex SESSIONS[sid] = { @@ -54,3 +60,167 @@ def set_session_model(sid: str, model_name: str | None): s = SESSIONS.get(sid) if s is not None: s["model_name"] = model_name + + +# ============================================================================ +# TRUE OPRO SESSION MANAGEMENT +# ============================================================================ + +def create_opro_run( + task_description: str, + test_cases: List[Tuple[str, str]] = None, + model_name: str = None +) -> str: + """ + Create a new OPRO optimization run. + + Args: + task_description: Description of the task to optimize for + test_cases: List of (input, expected_output) tuples for evaluation + model_name: Optional model name to use + + Returns: + run_id: Unique identifier for this OPRO run + """ + run_id = uuid.uuid4().hex + OPRO_RUNS[run_id] = { + "task_description": task_description, + "test_cases": test_cases or [], + "model_name": model_name, + "iteration": 0, + "trajectory": [], # List of (instruction, score) tuples + "best_instruction": None, + "best_score": 0.0, + "current_candidates": [], + "created_at": uuid.uuid1().time, + "status": "active" # active, completed, failed + } + return run_id + + +def get_opro_run(run_id: str) -> Dict[str, Any]: + """Get OPRO run by ID.""" + return OPRO_RUNS.get(run_id) + + +def update_opro_iteration( + run_id: str, + candidates: List[str], + scores: List[float] = None +): + """ + Update OPRO run with new iteration results. 
+ + Args: + run_id: OPRO run identifier + candidates: List of system instruction candidates + scores: Optional list of scores (if evaluated) + """ + run = OPRO_RUNS.get(run_id) + if not run: + return + + run["iteration"] += 1 + run["current_candidates"] = candidates + + # If scores provided, update trajectory + if scores and len(scores) == len(candidates): + for candidate, score in zip(candidates, scores): + run["trajectory"].append((candidate, score)) + + # Update best if this is better + if score > run["best_score"]: + run["best_score"] = score + run["best_instruction"] = candidate + + # Log the iteration + OPRO_RUN_LOG.append({ + "run_id": run_id, + "iteration": run["iteration"], + "num_candidates": len(candidates), + "best_score": run["best_score"] + }) + + +def add_opro_evaluation( + run_id: str, + instruction: str, + score: float +): + """ + Add a single evaluation result to OPRO run. + + Args: + run_id: OPRO run identifier + instruction: System instruction that was evaluated + score: Performance score + """ + run = OPRO_RUNS.get(run_id) + if not run: + return + + # Add to trajectory + run["trajectory"].append((instruction, score)) + + # Update best if this is better + if score > run["best_score"]: + run["best_score"] = score + run["best_instruction"] = instruction + + +def get_opro_trajectory(run_id: str) -> List[Tuple[str, float]]: + """ + Get the performance trajectory for an OPRO run. + + Returns: + List of (instruction, score) tuples sorted by score (highest first) + """ + run = OPRO_RUNS.get(run_id) + if not run: + return [] + + trajectory = run["trajectory"] + return sorted(trajectory, key=lambda x: x[1], reverse=True) + + +def set_opro_test_cases( + run_id: str, + test_cases: List[Tuple[str, str]] +): + """ + Set or update test cases for an OPRO run. + + Args: + run_id: OPRO run identifier + test_cases: List of (input, expected_output) tuples + """ + run = OPRO_RUNS.get(run_id) + if run: + run["test_cases"] = test_cases + + +def complete_opro_run(run_id: str): + """Mark an OPRO run as completed.""" + run = OPRO_RUNS.get(run_id) + if run: + run["status"] = "completed" + + +def list_opro_runs() -> List[Dict[str, Any]]: + """ + List all OPRO runs with summary information. + + Returns: + List of run summaries + """ + return [ + { + "run_id": run_id, + "task_description": run["task_description"][:100] + "..." 
if len(run["task_description"]) > 100 else run["task_description"], + "iteration": run["iteration"], + "best_score": run["best_score"], + "num_test_cases": len(run["test_cases"]), + "status": run["status"] + } + for run_id, run in OPRO_RUNS.items() + ] diff --git a/_qwen_xinference_demo/opro/user_prompt_optimizer.py b/_qwen_xinference_demo/opro/user_prompt_optimizer.py index d742e8f..be4c464 100644 --- a/_qwen_xinference_demo/opro/user_prompt_optimizer.py +++ b/_qwen_xinference_demo/opro/user_prompt_optimizer.py @@ -1,12 +1,18 @@ import re import numpy as np +from typing import List, Tuple from sklearn.cluster import AgglomerativeClustering from sklearn.metrics.pairwise import cosine_similarity import config from .ollama_client import call_qwen from .xinference_client import embed_texts -from .prompt_utils import refine_instruction, refine_instruction_with_history +from .prompt_utils import ( + refine_instruction, + refine_instruction_with_history, + generate_initial_system_instruction_candidates, + generate_optimized_system_instruction +) def parse_candidates(raw: str) -> list: lines = [l.strip() for l in re.split(r'\r?\n', raw) if l.strip()] @@ -33,7 +39,7 @@ def cluster_and_select(candidates: list, top_k=config.TOP_K, distance_threshold= linkage="average") labels = clustering.fit_predict(X) - selected_idx = [] + selected_idx = [] for label in sorted(set(labels)): idxs = [i for i,l in enumerate(labels) if l == label] sims = cosine_similarity(X[idxs]).mean(axis=1) @@ -44,6 +50,10 @@ def cluster_and_select(candidates: list, top_k=config.TOP_K, distance_threshold= return selected[:top_k] def generate_candidates(query: str, rejected=None, top_k=config.TOP_K, model_name=None): + """ + LEGACY: Query rewriting function (NOT true OPRO). + Kept for backward compatibility with existing API endpoints. + """ rejected = rejected or [] if rejected: prompt = refine_instruction_with_history(query, rejected) @@ -53,3 +63,87 @@ def generate_candidates(query: str, rejected=None, top_k=config.TOP_K, model_nam raw = call_qwen(prompt, temperature=0.9, max_tokens=1024, model_name=model_name) all_candidates = parse_candidates(raw) return cluster_and_select(all_candidates, top_k=top_k) + + +# ============================================================================ +# TRUE OPRO FUNCTIONS (System Instruction Optimization) +# ============================================================================ + +def generate_system_instruction_candidates( + task_description: str, + trajectory: List[Tuple[str, float]] = None, + top_k: int = config.TOP_K, + pool_size: int = None, + model_name: str = None +) -> List[str]: + """ + TRUE OPRO: Generates optimized system instruction candidates. + + This is the core OPRO function that generates system instructions based on + performance trajectory (if available) or initial candidates (if starting fresh). 
+ + Args: + task_description: Description of the task the LLM should perform + trajectory: Optional list of (instruction, score) tuples from previous iterations + top_k: Number of diverse candidates to return (default: config.TOP_K = 5) + pool_size: Number of candidates to generate before clustering (default: config.GENERATION_POOL_SIZE = 10) + model_name: Optional model name to use for generation + + Returns: + List of top-k diverse system instruction candidates + """ + pool_size = pool_size or config.GENERATION_POOL_SIZE + + # Generate the meta-prompt based on whether we have trajectory data + if trajectory and len(trajectory) > 0: + # Sort trajectory by score (highest first) + sorted_trajectory = sorted(trajectory, key=lambda x: x[1], reverse=True) + meta_prompt = generate_optimized_system_instruction(task_description, sorted_trajectory, pool_size) + else: + # No trajectory yet, generate initial candidates + meta_prompt = generate_initial_system_instruction_candidates(task_description, pool_size) + + # Use the optimizer LLM to generate candidates + raw = call_qwen(meta_prompt, temperature=0.9, max_tokens=1024, model_name=model_name) + + # Parse the generated candidates + all_candidates = parse_candidates(raw) + + # Cluster and select diverse representatives + return cluster_and_select(all_candidates, top_k=top_k) + + +def evaluate_system_instruction( + system_instruction: str, + test_cases: List[Tuple[str, str]], + model_name: str = None +) -> float: + """ + TRUE OPRO: Evaluates a system instruction's performance on test cases. + + Args: + system_instruction: The system instruction to evaluate + test_cases: List of (input, expected_output) tuples + model_name: Optional model name to use for evaluation + + Returns: + Performance score (0.0 to 1.0) + """ + if not test_cases: + return 0.0 + + correct = 0 + total = len(test_cases) + + for input_text, expected_output in test_cases: + # Construct the full prompt with system instruction + full_prompt = f"{system_instruction}\n\n{input_text}" + + # Get LLM response + response = call_qwen(full_prompt, temperature=0.2, max_tokens=512, model_name=model_name) + + # Simple exact match scoring (can be replaced with more sophisticated metrics) + if expected_output.strip().lower() in response.strip().lower(): + correct += 1 + + return correct / total diff --git a/config.py b/config.py index 2662c83..00fa928 100644 --- a/config.py +++ b/config.py @@ -14,6 +14,7 @@ DEFAULT_EMBED_MODEL = "qwen3-embedding:4b" XINFERENCE_EMBED_URL = "http://127.0.0.1:9997/models/bge-base-zh/embed" # Clustering/selection -TOP_K = 5 +GENERATION_POOL_SIZE = 10 # Generate this many candidates before clustering +TOP_K = 5 # Return this many diverse candidates to user CLUSTER_DISTANCE_THRESHOLD = 0.15 diff --git a/examples/opro_demo.py b/examples/opro_demo.py new file mode 100644 index 0000000..bb89ef9 --- /dev/null +++ b/examples/opro_demo.py @@ -0,0 +1,164 @@ +""" +TRUE OPRO Demo Script + +This script demonstrates the true OPRO (Optimization by PROmpting) functionality. +It shows how to: +1. Generate initial system instruction candidates +2. Evaluate them on test cases +3. Use the performance trajectory to generate better candidates +""" + +import sys +sys.path.insert(0, '.') + +from _qwen_xinference_demo.opro.user_prompt_optimizer import ( + generate_system_instruction_candidates, + evaluate_system_instruction +) +import config + + +def demo_opro_workflow(): + """ + Demonstrates a complete OPRO optimization workflow. 
+ """ + print("=" * 80) + print("TRUE OPRO Demo - System Instruction Optimization") + print("=" * 80) + print(f"Pool Size: {config.GENERATION_POOL_SIZE} candidates → Clustered to Top {config.TOP_K}") + + # Define the task + task_description = """ +任务:将用户输入的中文句子翻译成英文。 +要求:翻译准确、自然、符合英语表达习惯。 +""" + + print(f"\n📋 Task Description:\n{task_description}") + + # Define test cases for evaluation + test_cases = [ + ("你好,很高兴见到你", "Hello, nice to meet you"), + ("今天天气真好", "The weather is really nice today"), + ("我喜欢学习编程", "I like learning programming"), + ("这本书很有趣", "This book is very interesting"), + ] + + print(f"\n🧪 Test Cases: {len(test_cases)} examples") + for i, (input_text, expected) in enumerate(test_cases, 1): + print(f" {i}. '{input_text}' → '{expected}'") + + # Iteration 1: Generate initial candidates + print("\n" + "=" * 80) + print("🔄 Iteration 1: Generating Initial System Instruction Candidates") + print("=" * 80) + + print("\n⏳ Generating candidates... (this may take a moment)") + candidates_round1 = generate_system_instruction_candidates( + task_description=task_description, + trajectory=None, # No history yet + top_k=3, + model_name=None # Use default model + ) + + print(f"\n✅ Generated {len(candidates_round1)} candidates:") + for i, candidate in enumerate(candidates_round1, 1): + print(f"\n Candidate {i}:") + print(f" {candidate[:100]}..." if len(candidate) > 100 else f" {candidate}") + + # Evaluate each candidate + print("\n" + "-" * 80) + print("📊 Evaluating Candidates on Test Cases") + print("-" * 80) + + trajectory = [] + for i, candidate in enumerate(candidates_round1, 1): + print(f"\n⏳ Evaluating Candidate {i}...") + score = evaluate_system_instruction( + system_instruction=candidate, + test_cases=test_cases, + model_name=None + ) + trajectory.append((candidate, score)) + print(f" Score: {score:.2%}") + + # Sort by score + trajectory.sort(key=lambda x: x[1], reverse=True) + + print("\n📈 Performance Summary (Round 1):") + for i, (candidate, score) in enumerate(trajectory, 1): + print(f" {i}. Score: {score:.2%} - {candidate[:60]}...") + + best_score = trajectory[0][1] + print(f"\n🏆 Best Score: {best_score:.2%}") + + # Iteration 2: Generate optimized candidates based on trajectory + print("\n" + "=" * 80) + print("🔄 Iteration 2: Generating Optimized System Instructions") + print("=" * 80) + print(f"\n💡 Using performance trajectory to generate better candidates...") + print(f" Goal: Beat current best score of {best_score:.2%}") + + print("\n⏳ Generating optimized candidates...") + candidates_round2 = generate_system_instruction_candidates( + task_description=task_description, + trajectory=trajectory, # Use performance history + top_k=3, + model_name=None + ) + + print(f"\n✅ Generated {len(candidates_round2)} optimized candidates:") + for i, candidate in enumerate(candidates_round2, 1): + print(f"\n Candidate {i}:") + print(f" {candidate[:100]}..." if len(candidate) > 100 else f" {candidate}") + + # Evaluate new candidates + print("\n" + "-" * 80) + print("📊 Evaluating Optimized Candidates") + print("-" * 80) + + for i, candidate in enumerate(candidates_round2, 1): + print(f"\n⏳ Evaluating Optimized Candidate {i}...") + score = evaluate_system_instruction( + system_instruction=candidate, + test_cases=test_cases, + model_name=None + ) + trajectory.append((candidate, score)) + print(f" Score: {score:.2%}") + if score > best_score: + print(f" 🎉 NEW BEST! 
Improved from {best_score:.2%} to {score:.2%}") + best_score = score + + # Final summary + trajectory.sort(key=lambda x: x[1], reverse=True) + + print("\n" + "=" * 80) + print("🏁 Final Results") + print("=" * 80) + print(f"\n🏆 Best System Instruction (Score: {trajectory[0][1]:.2%}):") + print(f"\n{trajectory[0][0]}") + + print("\n📊 All Candidates Ranked:") + for i, (candidate, score) in enumerate(trajectory[:5], 1): + print(f"\n {i}. Score: {score:.2%}") + print(f" {candidate[:80]}...") + + print("\n" + "=" * 80) + print("✅ OPRO Demo Complete!") + print("=" * 80) + + +if __name__ == "__main__": + print("\n⚠️ NOTE: This demo requires:") + print(" 1. Ollama running locally (http://127.0.0.1:11434)") + print(" 2. A Qwen model available (e.g., qwen3:8b)") + print(" 3. An embedding model (e.g., qwen3-embedding:4b)") + print("\n Press Ctrl+C to cancel, or Enter to continue...") + + try: + input() + demo_opro_workflow() + except KeyboardInterrupt: + print("\n\n❌ Demo cancelled by user.") + sys.exit(0) + diff --git a/frontend/opro.html b/frontend/opro.html new file mode 100644 index 0000000..ddf03cc --- /dev/null +++ b/frontend/opro.html @@ -0,0 +1,507 @@ + + + + + + + + + OPRO - System Instruction Optimizer + + + + + + +
+ + + + + diff --git a/test_opro_api.py b/test_opro_api.py new file mode 100644 index 0000000..25b9afe --- /dev/null +++ b/test_opro_api.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +""" +Test script for TRUE OPRO API endpoints. + +This script tests the complete OPRO workflow: +1. Create OPRO run +2. Generate initial candidates +3. Evaluate candidates +4. Generate optimized candidates +5. View results + +Usage: + python test_opro_api.py +""" + +import requests +import json +import time + +BASE_URL = "http://127.0.0.1:8010" + +def print_section(title): + """Print a section header.""" + print("\n" + "=" * 60) + print(f" {title}") + print("=" * 60) + +def test_opro_workflow(): + """Test the complete OPRO workflow.""" + + print_section("1. Create OPRO Run") + + # Create a new OPRO run + create_req = { + "task_description": "将用户输入的中文翻译成英文,要求准确自然", + "test_cases": [ + {"input": "你好", "expected_output": "Hello"}, + {"input": "谢谢", "expected_output": "Thank you"}, + {"input": "早上好", "expected_output": "Good morning"}, + {"input": "晚安", "expected_output": "Good night"}, + {"input": "再见", "expected_output": "Goodbye"} + ] + } + + response = requests.post(f"{BASE_URL}/opro/create", json=create_req) + result = response.json() + + if not result.get("success"): + print(f"❌ Failed to create OPRO run: {result}") + return + + run_id = result["data"]["run_id"] + print(f"✅ Created OPRO run: {run_id}") + print(f" Task: {result['data']['task_description']}") + print(f" Test cases: {result['data']['num_test_cases']}") + + # ======================================================================== + print_section("2. Generate Initial Candidates") + + iterate_req = {"run_id": run_id, "top_k": 5} + response = requests.post(f"{BASE_URL}/opro/iterate", json=iterate_req) + result = response.json() + + if not result.get("success"): + print(f"❌ Failed to generate candidates: {result}") + return + + candidates = result["data"]["candidates"] + print(f"✅ Generated {len(candidates)} initial candidates:") + for i, candidate in enumerate(candidates, 1): + print(f"\n [{i}] {candidate[:100]}...") + + # ======================================================================== + print_section("3. Evaluate Candidates") + + scores = [] + for i, candidate in enumerate(candidates, 1): + print(f"\n Evaluating candidate {i}/{len(candidates)}...") + + eval_req = { + "run_id": run_id, + "instruction": candidate + } + + response = requests.post(f"{BASE_URL}/opro/evaluate", json=eval_req) + result = response.json() + + if result.get("success"): + score = result["data"]["score"] + scores.append(score) + is_best = "🏆" if result["data"]["is_new_best"] else "" + print(f" ✅ Score: {score:.4f} {is_best}") + else: + print(f" ❌ Evaluation failed: {result}") + + time.sleep(0.5) # Small delay to avoid overwhelming the API + + print(f"\n Average score: {sum(scores)/len(scores):.4f}") + print(f" Best score: {max(scores):.4f}") + + # ======================================================================== + print_section("4. 
Generate Optimized Candidates (Iteration 2)") + + print(" Generating candidates based on performance trajectory...") + + iterate_req = {"run_id": run_id, "top_k": 5} + response = requests.post(f"{BASE_URL}/opro/iterate", json=iterate_req) + result = response.json() + + if not result.get("success"): + print(f"❌ Failed to generate optimized candidates: {result}") + return + + optimized_candidates = result["data"]["candidates"] + print(f"✅ Generated {len(optimized_candidates)} optimized candidates:") + for i, candidate in enumerate(optimized_candidates, 1): + print(f"\n [{i}] {candidate[:100]}...") + + # ======================================================================== + print_section("5. View Run Details") + + response = requests.get(f"{BASE_URL}/opro/run/{run_id}") + result = response.json() + + if not result.get("success"): + print(f"❌ Failed to get run details: {result}") + return + + data = result["data"] + print(f"✅ OPRO Run Details:") + print(f" Run ID: {data['run_id']}") + print(f" Task: {data['task_description']}") + print(f" Iteration: {data['iteration']}") + print(f" Status: {data['status']}") + print(f" Best Score: {data['best_score']:.4f}") + print(f"\n Best Instruction:") + print(f" {data['best_instruction'][:200]}...") + + print(f"\n Top 5 Trajectory:") + for i, item in enumerate(data['trajectory'][:5], 1): + print(f" [{i}] Score: {item['score']:.4f}") + print(f" {item['instruction'][:80]}...") + + # ======================================================================== + print_section("6. List All Runs") + + response = requests.get(f"{BASE_URL}/opro/runs") + result = response.json() + + if result.get("success"): + runs = result["data"]["runs"] + print(f"✅ Total OPRO runs: {result['data']['total']}") + for run in runs: + print(f"\n Run: {run['run_id']}") + print(f" Task: {run['task_description'][:50]}...") + print(f" Iteration: {run['iteration']}, Best Score: {run['best_score']:.4f}") + + print_section("✅ OPRO Workflow Test Complete!") + print(f"\nRun ID: {run_id}") + print("You can view details at:") + print(f" {BASE_URL}/opro/run/{run_id}") + + +if __name__ == "__main__": + print("=" * 60) + print(" TRUE OPRO API Test") + print("=" * 60) + print(f"\nBase URL: {BASE_URL}") + print("\nMake sure the API server is running:") + print(" uvicorn _qwen_xinference_demo.api:app --host 127.0.0.1 --port 8010") + print("\nStarting test in 3 seconds...") + time.sleep(3) + + try: + test_opro_workflow() + except requests.exceptions.ConnectionError: + print("\n❌ ERROR: Could not connect to API server") + print("Please start the server first:") + print(" uvicorn _qwen_xinference_demo.api:app --host 127.0.0.1 --port 8010") + except Exception as e: + print(f"\n❌ ERROR: {e}") + import traceback + traceback.print_exc() +