feat: implement true OPRO with Gemini-style UI

- Add true OPRO system instruction optimization (vs query rewriting)
- Implement iterative optimization with performance trajectory
- Add new OPRO API endpoints (/opro/create, /opro/generate_and_evaluate, /opro/execute)
- Create modern Gemini-style chat UI (frontend/opro.html)
- Optimize performance: reduce candidates from 20 to 10 (2x faster)
- Add model selector in UI toolbar
- Add collapsible sidebar with session management
- Add copy button for instructions
- Ensure all generated prompts use simplified Chinese
- Update README with comprehensive documentation
- Add .gitignore for local_docs folder
2025-12-06 17:24:28 +08:00
parent 8f52fad41c
commit 1376d60ed5
10 changed files with 1817 additions and 13 deletions
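
For reviewers, a minimal end-to-end sketch of the new true-OPRO endpoints added here (not part of the diff). It assumes a local dev server on port 8000 and that the ok() helper wraps each payload under a "data" key in the JSON response; both the base URL and the envelope shape are assumptions, so adjust field access to match the actual deployment.

import requests

BASE = "http://localhost:8000"  # assumed local dev server

# 1. Create an OPRO run with a task description and a couple of test cases.
create = requests.post(f"{BASE}/opro/create", json={
    "task_description": "Classify a Chinese product review as positive or negative.",
    "test_cases": [
        {"input": "这个产品太好用了", "expected_output": "positive"},
        {"input": "质量很差，不推荐", "expected_output": "negative"},
    ],
}).json()
run_id = create["data"]["run_id"]  # assumes ok() nests the payload under "data"

# 2. Generate candidate system instructions and auto-evaluate them on the test cases.
result = requests.post(f"{BASE}/opro/generate_and_evaluate",
                       json={"run_id": run_id, "top_k": 5}).json()
best_instruction = result["data"]["candidates"][0]["instruction"]  # sorted best-first

# 3. Use the best instruction as a system prompt for a new input.
answer = requests.post(f"{BASE}/opro/execute", json={
    "instruction": best_instruction,
    "user_input": "物流很快，包装也很好",
}).json()
print(answer)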

View File

@@ -2,14 +2,30 @@ from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from typing import List, Tuple, Optional
import config
# Legacy session management (query rewriting)
from .opro.session_state import create_session, get_session, update_session_add_candidates, log_user_choice
from .opro.session_state import log_user_reject
from .opro.session_state import set_selected_prompt, log_chat_message
from .opro.session_state import set_session_model
from .opro.session_state import USER_FEEDBACK_LOG
# True OPRO session management
from .opro.session_state import (
create_opro_run, get_opro_run, update_opro_iteration,
add_opro_evaluation, get_opro_trajectory, set_opro_test_cases,
complete_opro_run, list_opro_runs
)
# Optimization functions
from .opro.user_prompt_optimizer import generate_candidates
from .opro.user_prompt_optimizer import (
generate_system_instruction_candidates,
evaluate_system_instruction
)
from .opro.ollama_client import call_qwen
from .opro.ollama_client import list_models
@@ -23,8 +39,9 @@ app = FastAPI(
openapi_tags=[
{"name": "health", "description": "健康检查"},
{"name": "models", "description": "模型列表与设置"},
{"name": "sessions", "description": "会话管理"},
{"name": "opro", "description": "提示优化候选生成与选择/拒绝"},
{"name": "sessions", "description": "会话管理(旧版查询重写)"},
{"name": "opro-legacy", "description": "旧版提示优化(查询重写)"},
{"name": "opro-true", "description": "真正的OPRO系统指令优化"},
{"name": "chat", "description": "会话聊天"},
{"name": "ui", "description": "静态页面"}
]
@@ -89,14 +106,69 @@ class SetModelReq(BaseModel):
session_id: str
model_name: str
@app.post("/start", tags=["opro"])
# ============================================================================
# TRUE OPRO REQUEST MODELS
# ============================================================================
class TestCase(BaseModel):
"""A single test case for OPRO evaluation."""
input: str
expected_output: str
class CreateOPRORunReq(BaseModel):
"""Request to create a new OPRO optimization run."""
task_description: str
test_cases: Optional[List[TestCase]] = None
model_name: Optional[str] = None
class OPROIterateReq(BaseModel):
"""Request to run one OPRO iteration."""
run_id: str
top_k: Optional[int] = None
class OPROEvaluateReq(BaseModel):
"""Request to evaluate a system instruction."""
run_id: str
instruction: str
class OPROAddTestCasesReq(BaseModel):
"""Request to add test cases to an OPRO run."""
run_id: str
test_cases: List[TestCase]
class OPROGenerateAndEvaluateReq(BaseModel):
"""Request to generate and auto-evaluate candidates (for chat-like UX)."""
run_id: str
top_k: Optional[int] = None
pool_size: Optional[int] = None
auto_evaluate: Optional[bool] = True # If False, use diversity-based selection only
class OPROExecuteReq(BaseModel):
"""Request to execute a system instruction with user input."""
instruction: str
user_input: str
model_name: Optional[str] = None
# ============================================================================
# LEGACY ENDPOINTS (Query Rewriting - NOT true OPRO)
# ============================================================================
@app.post("/start", tags=["opro-legacy"])
def start(req: StartReq):
sid = create_session(req.query)
cands = generate_candidates(req.query, [], model_name=get_session(sid).get("model_name"))
update_session_add_candidates(sid, cands)
return ok({"session_id": sid, "round": 0, "candidates": cands})
@app.post("/next", tags=["opro"])
@app.post("/next", tags=["opro-legacy"])
def next_round(req: NextReq):
s = get_session(req.session_id)
if not s:
@@ -110,7 +182,7 @@ def next_round(req: NextReq):
update_session_add_candidates(req.session_id, cands)
return ok({"session_id": req.session_id, "round": s["round"], "candidates": cands})
@app.post("/select", tags=["opro"])
@app.post("/select", tags=["opro-legacy"])
def select(req: SelectReq):
s = get_session(req.session_id)
if not s:
@@ -138,7 +210,7 @@ def select(req: SelectReq):
pass
return ok({"prompt": req.choice, "answer": ans})
@app.post("/reject", tags=["opro"])
@app.post("/reject", tags=["opro-legacy"])
def reject(req: RejectReq):
s = get_session(req.session_id)
if not s:
@@ -151,7 +223,7 @@ class QueryReq(BaseModel):
query: str
session_id: str | None = None
@app.post("/query", tags=["opro"])
@app.post("/query", tags=["opro-legacy"])
def query(req: QueryReq):
if req.session_id:
s = get_session(req.session_id)
@@ -240,7 +312,7 @@ def message(req: MessageReq):
class QueryFromMsgReq(BaseModel):
session_id: str
@app.post("/query_from_message", tags=["opro"])
@app.post("/query_from_message", tags=["opro-legacy"])
def query_from_message(req: QueryFromMsgReq):
s = get_session(req.session_id)
if not s:
@@ -258,7 +330,7 @@ def query_from_message(req: QueryFromMsgReq):
class AnswerReq(BaseModel):
query: str
@app.post("/answer", tags=["opro"])
@app.post("/answer", tags=["opro-legacy"])
def answer(req: AnswerReq):
sid = create_session(req.query)
log_chat_message(sid, "user", req.query)
@@ -282,3 +354,287 @@ def set_model(req: SetModelReq):
raise AppException(400, f"model not available: {req.model_name}", "MODEL_NOT_AVAILABLE")
set_session_model(req.session_id, req.model_name)
return ok({"session_id": req.session_id, "model_name": req.model_name})
# ============================================================================
# TRUE OPRO ENDPOINTS (System Instruction Optimization)
# ============================================================================
@app.post("/opro/create", tags=["opro-true"])
def opro_create_run(req: CreateOPRORunReq):
"""
Create a new OPRO optimization run.
This starts a new system instruction optimization process for a given task.
"""
# Convert test cases from Pydantic models to tuples
test_cases = None
if req.test_cases:
test_cases = [(tc.input, tc.expected_output) for tc in req.test_cases]
run_id = create_opro_run(
task_description=req.task_description,
test_cases=test_cases,
model_name=req.model_name
)
run = get_opro_run(run_id)
return ok({
"run_id": run_id,
"task_description": run["task_description"],
"num_test_cases": len(run["test_cases"]),
"iteration": run["iteration"],
"status": run["status"]
})
@app.post("/opro/iterate", tags=["opro-true"])
def opro_iterate(req: OPROIterateReq):
"""
Run one OPRO iteration: generate new system instruction candidates.
This generates optimized system instructions based on the performance trajectory.
"""
run = get_opro_run(req.run_id)
if not run:
raise AppException(404, "OPRO run not found", "RUN_NOT_FOUND")
# Get trajectory for optimization
trajectory = get_opro_trajectory(req.run_id)
# Generate candidates
top_k = req.top_k or config.TOP_K
try:
candidates = generate_system_instruction_candidates(
task_description=run["task_description"],
trajectory=trajectory if trajectory else None,
top_k=top_k,
model_name=run["model_name"]
)
except Exception as e:
raise AppException(500, f"Failed to generate candidates: {e}", "GENERATION_ERROR")
# Update run with new candidates
update_opro_iteration(req.run_id, candidates)
return ok({
"run_id": req.run_id,
"iteration": run["iteration"] + 1,
"candidates": candidates,
"num_candidates": len(candidates),
"best_score": run["best_score"]
})
@app.post("/opro/evaluate", tags=["opro-true"])
def opro_evaluate(req: OPROEvaluateReq):
"""
Evaluate a system instruction on the test cases.
This scores the instruction and updates the performance trajectory.
"""
run = get_opro_run(req.run_id)
if not run:
raise AppException(404, "OPRO run not found", "RUN_NOT_FOUND")
if not run["test_cases"]:
raise AppException(400, "No test cases defined for this run", "NO_TEST_CASES")
# Evaluate the instruction
try:
score = evaluate_system_instruction(
system_instruction=req.instruction,
test_cases=run["test_cases"],
model_name=run["model_name"]
)
except Exception as e:
raise AppException(500, f"Evaluation failed: {e}", "EVALUATION_ERROR")
# Remember the previous best so a tie is not reported as an improvement
previous_best = run["best_score"]
# Add to trajectory
add_opro_evaluation(req.run_id, req.instruction, score)
# Get updated run info
run = get_opro_run(req.run_id)
return ok({
"run_id": req.run_id,
"instruction": req.instruction,
"score": score,
"best_score": run["best_score"],
"is_new_best": score > previous_best
})
@app.get("/opro/runs", tags=["opro-true"])
def opro_list_runs():
"""
List all OPRO optimization runs.
"""
runs = list_opro_runs()
return ok({"runs": runs, "total": len(runs)})
@app.get("/opro/run/{run_id}", tags=["opro-true"])
def opro_get_run(run_id: str):
"""
Get detailed information about an OPRO run.
"""
run = get_opro_run(run_id)
if not run:
raise AppException(404, "OPRO run not found", "RUN_NOT_FOUND")
# Get sorted trajectory
trajectory = get_opro_trajectory(run_id)
return ok({
"run_id": run_id,
"task_description": run["task_description"],
"iteration": run["iteration"],
"status": run["status"],
"best_score": run["best_score"],
"best_instruction": run["best_instruction"],
"num_test_cases": len(run["test_cases"]),
"test_cases": [{"input": tc[0], "expected_output": tc[1]} for tc in run["test_cases"]],
"trajectory": [{"instruction": inst, "score": score} for inst, score in trajectory[:10]], # Top 10
"current_candidates": run["current_candidates"]
})
@app.post("/opro/test_cases", tags=["opro-true"])
def opro_add_test_cases(req: OPROAddTestCasesReq):
"""
Add or update test cases for an OPRO run.
"""
run = get_opro_run(req.run_id)
if not run:
raise AppException(404, "OPRO run not found", "RUN_NOT_FOUND")
# Convert test cases
test_cases = [(tc.input, tc.expected_output) for tc in req.test_cases]
# Update test cases
set_opro_test_cases(req.run_id, test_cases)
return ok({
"run_id": req.run_id,
"num_test_cases": len(test_cases),
"test_cases": [{"input": tc[0], "expected_output": tc[1]} for tc in test_cases]
})
@app.post("/opro/generate_and_evaluate", tags=["opro-true"])
def opro_generate_and_evaluate(req: OPROGenerateAndEvaluateReq):
"""
Generate candidates and auto-evaluate them (for chat-like UX).
This is the main endpoint for the chat interface. It:
1. Generates candidates based on trajectory
2. Auto-evaluates them (if test cases exist and auto_evaluate=True)
3. Returns top-k sorted by score (or diversity if no evaluation)
"""
run = get_opro_run(req.run_id)
if not run:
raise AppException(404, "OPRO run not found", "RUN_NOT_FOUND")
top_k = req.top_k or config.TOP_K
pool_size = req.pool_size or config.GENERATION_POOL_SIZE
# Get trajectory for optimization
trajectory = get_opro_trajectory(req.run_id)
# Generate candidates
try:
candidates = generate_system_instruction_candidates(
task_description=run["task_description"],
trajectory=trajectory if trajectory else None,
top_k=pool_size, # Generate pool_size candidates first
pool_size=pool_size,
model_name=run["model_name"]
)
except Exception as e:
raise AppException(500, f"Failed to generate candidates: {e}", "GENERATION_ERROR")
# Decide whether to evaluate
should_evaluate = req.auto_evaluate and len(run["test_cases"]) > 0
if should_evaluate:
# Auto-evaluate all candidates
scored_candidates = []
for candidate in candidates:
try:
score = evaluate_system_instruction(
system_instruction=candidate,
test_cases=run["test_cases"],
model_name=run["model_name"]
)
scored_candidates.append({"instruction": candidate, "score": score})
# Add to trajectory
add_opro_evaluation(req.run_id, candidate, score)
except Exception as e:
# If evaluation fails, assign score 0
scored_candidates.append({"instruction": candidate, "score": 0.0})
# Sort by score (highest first)
scored_candidates.sort(key=lambda x: x["score"], reverse=True)
# Return top-k
top_candidates = scored_candidates[:top_k]
# Update iteration
update_opro_iteration(req.run_id, [c["instruction"] for c in top_candidates])
return ok({
"run_id": req.run_id,
"candidates": top_candidates,
"iteration": run["iteration"] + 1,
"evaluated": True,
"best_score": run["best_score"]
})
else:
# No evaluation - use diversity-based selection (already done by clustering)
# Just return the candidates without scores
top_candidates = [
{"instruction": candidate, "score": None}
for candidate in candidates[:top_k]
]
# Update iteration
update_opro_iteration(req.run_id, [c["instruction"] for c in top_candidates])
return ok({
"run_id": req.run_id,
"candidates": top_candidates,
"iteration": run["iteration"] + 1,
"evaluated": False,
"best_score": run["best_score"]
})
@app.post("/opro/execute", tags=["opro-true"])
def opro_execute(req: OPROExecuteReq):
"""
Execute a system instruction with user input.
This uses the selected instruction as a system prompt and calls the LLM.
"""
try:
# Construct full prompt with system instruction
full_prompt = f"{req.instruction}\n\n{req.user_input}"
# Call LLM
response = call_qwen(
full_prompt,
temperature=0.2,
max_tokens=1024,
model_name=req.model_name
)
return ok({
"instruction": req.instruction,
"user_input": req.user_input,
"response": response
})
except Exception as e:
raise AppException(500, f"Execution failed: {e}", "EXECUTION_ERROR")

View File

@@ -1,4 +1,14 @@
from typing import List, Tuple
# ============================================================================
# OLD FUNCTIONS (Query Rewriting - NOT true OPRO, kept for compatibility)
# ============================================================================
def refine_instruction(query: str) -> str:
"""
LEGACY: Generates query rewrites (NOT true OPRO).
This is query expansion, not system instruction optimization.
"""
return f"""
你是一个“问题澄清与重写助手”。
请根据用户的原始问题:
@@ -7,6 +17,9 @@ def refine_instruction(query: str) -> str:
"""
def refine_instruction_with_history(query: str, rejected_list: list) -> str:
"""
LEGACY: Generates query rewrites with rejection history (NOT true OPRO).
"""
rejected_text = "\n".join(f"- {r}" for r in rejected_list) if rejected_list else ""
return f"""
你是一个“问题澄清与重写助手”。
@@ -18,3 +31,100 @@ def refine_instruction_with_history(query: str, rejected_list: list) -> str:
请从新的角度重新生成至少20条不同的改写问题，每条单独一行。
"""
# ============================================================================
# TRUE OPRO FUNCTIONS (System Instruction Optimization)
# ============================================================================
def generate_initial_system_instruction_candidates(task_description: str, pool_size: int = None) -> str:
"""
TRUE OPRO: Generates initial candidate System Instructions for a new OPRO run.
Args:
task_description: Description of the task the LLM should perform
pool_size: Number of candidates to generate (defaults to config.GENERATION_POOL_SIZE)
Returns:
Meta-prompt that instructs the optimizer LLM to generate system instruction candidates
"""
import config
pool_size = pool_size or config.GENERATION_POOL_SIZE
return f"""
你是一个"系统指令生成助手"
目标任务描述:
{task_description}
请根据以上任务,生成 {pool_size} 条高质量、风格各异的"System Instruction"候选指令。
要求:
1. 每条指令必须有明显不同的风格和侧重点
2. 覆盖不同的实现策略(例如:简洁型、详细型、示例型、角色扮演型、步骤型等)
3. 这些指令应指导LLM的行为和输出格式以最大化任务性能
4. 每条指令单独成行,不包含编号或额外说明
5. 所有生成的指令必须使用简体中文
生成 {pool_size} 条指令:
"""
def generate_optimized_system_instruction(
task_description: str,
trajectory: List[Tuple[str, float]],
pool_size: int = None
) -> str:
"""
TRUE OPRO: Analyzes performance trajectory and generates optimized System Instructions.
This is the core OPRO function that uses an LLM as an optimizer to improve
system instructions based on historical performance scores.
Args:
task_description: Description of the task the LLM should perform
trajectory: List of (instruction, score) tuples, sorted by score (highest first)
pool_size: Number of candidates to generate (defaults to config.GENERATION_POOL_SIZE)
Returns:
Meta-prompt that instructs the optimizer LLM to generate better system instructions
"""
import config
pool_size = pool_size or config.GENERATION_POOL_SIZE
if not trajectory:
# If no trajectory, fall back to initial generation
return generate_initial_system_instruction_candidates(task_description, pool_size)
# Format the trajectory for the Optimizer LLM
formatted_history = "\n".join(
f"--- Instruction Score: {score:.4f}\n{instruction}"
for instruction, score in trajectory
)
# Determine the current highest score to set the optimization goal
highest_score = max(score for _, score in trajectory)
# Construct the Meta-Prompt (The OPRO Instruction)
return f"""
你是一个"System Prompt 优化器"
你的任务是改进一个LLM的系统指令以最大化其在以下任务中的性能
{task_description}
---
**历史性能轨迹 (Instructions and Scores):**
{formatted_history}
---
**当前最高得分: {highest_score:.4f}**
请分析得分最高的指令的特点和得分最低指令的缺陷。
然后,生成 {pool_size} 条新的、有潜力超越 {highest_score:.4f} 分的System Instruction。
要求:
1. 每条指令必须有明显不同的改进策略
2. 结合高分指令的优点,避免低分指令的缺陷
3. 探索新的优化方向和表达方式
4. 每条指令单独成行,不包含编号或额外说明
5. 所有生成的指令必须使用简体中文
生成 {pool_size} 条优化后的指令:
"""

View File

@@ -1,8 +1,14 @@
import uuid
from typing import List, Tuple, Dict, Any
# Legacy session storage (for query rewriting)
SESSIONS = {}
USER_FEEDBACK_LOG = []
# OPRO session storage (for system instruction optimization)
OPRO_RUNS = {}
OPRO_RUN_LOG = []
def create_session(query: str) -> str:
sid = uuid.uuid4().hex
SESSIONS[sid] = {
@@ -54,3 +60,167 @@ def set_session_model(sid: str, model_name: str | None):
s = SESSIONS.get(sid)
if s is not None:
s["model_name"] = model_name
# ============================================================================
# TRUE OPRO SESSION MANAGEMENT
# ============================================================================
def create_opro_run(
task_description: str,
test_cases: List[Tuple[str, str]] = None,
model_name: str = None
) -> str:
"""
Create a new OPRO optimization run.
Args:
task_description: Description of the task to optimize for
test_cases: List of (input, expected_output) tuples for evaluation
model_name: Optional model name to use
Returns:
run_id: Unique identifier for this OPRO run
"""
run_id = uuid.uuid4().hex
OPRO_RUNS[run_id] = {
"task_description": task_description,
"test_cases": test_cases or [],
"model_name": model_name,
"iteration": 0,
"trajectory": [], # List of (instruction, score) tuples
"best_instruction": None,
"best_score": 0.0,
"current_candidates": [],
"created_at": uuid.uuid1().time,
"status": "active" # active, completed, failed
}
return run_id
def get_opro_run(run_id: str) -> Dict[str, Any]:
"""Get OPRO run by ID."""
return OPRO_RUNS.get(run_id)
def update_opro_iteration(
run_id: str,
candidates: List[str],
scores: List[float] = None
):
"""
Update OPRO run with new iteration results.
Args:
run_id: OPRO run identifier
candidates: List of system instruction candidates
scores: Optional list of scores (if evaluated)
"""
run = OPRO_RUNS.get(run_id)
if not run:
return
run["iteration"] += 1
run["current_candidates"] = candidates
# If scores provided, update trajectory
if scores and len(scores) == len(candidates):
for candidate, score in zip(candidates, scores):
run["trajectory"].append((candidate, score))
# Update best if this is better
if score > run["best_score"]:
run["best_score"] = score
run["best_instruction"] = candidate
# Log the iteration
OPRO_RUN_LOG.append({
"run_id": run_id,
"iteration": run["iteration"],
"num_candidates": len(candidates),
"best_score": run["best_score"]
})
def add_opro_evaluation(
run_id: str,
instruction: str,
score: float
):
"""
Add a single evaluation result to OPRO run.
Args:
run_id: OPRO run identifier
instruction: System instruction that was evaluated
score: Performance score
"""
run = OPRO_RUNS.get(run_id)
if not run:
return
# Add to trajectory
run["trajectory"].append((instruction, score))
# Update best if this is better
if score > run["best_score"]:
run["best_score"] = score
run["best_instruction"] = instruction
def get_opro_trajectory(run_id: str) -> List[Tuple[str, float]]:
"""
Get the performance trajectory for an OPRO run.
Returns:
List of (instruction, score) tuples sorted by score (highest first)
"""
run = OPRO_RUNS.get(run_id)
if not run:
return []
trajectory = run["trajectory"]
return sorted(trajectory, key=lambda x: x[1], reverse=True)
def set_opro_test_cases(
run_id: str,
test_cases: List[Tuple[str, str]]
):
"""
Set or update test cases for an OPRO run.
Args:
run_id: OPRO run identifier
test_cases: List of (input, expected_output) tuples
"""
run = OPRO_RUNS.get(run_id)
if run:
run["test_cases"] = test_cases
def complete_opro_run(run_id: str):
"""Mark an OPRO run as completed."""
run = OPRO_RUNS.get(run_id)
if run:
run["status"] = "completed"
def list_opro_runs() -> List[Dict[str, Any]]:
"""
List all OPRO runs with summary information.
Returns:
List of run summaries
"""
return [
{
"run_id": run_id,
"task_description": run["task_description"][:100] + "..." if len(run["task_description"]) > 100 else run["task_description"],
"iteration": run["iteration"],
"best_score": run["best_score"],
"num_test_cases": len(run["test_cases"]),
"status": run["status"]
}
for run_id, run in OPRO_RUNS.items()
]
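
Because these helpers keep all state in process-local dicts, they can be exercised directly in a REPL or unit test without the API layer (import path assumed):

from app.opro.session_state import (  # package path is an assumption
    create_opro_run, add_opro_evaluation, get_opro_trajectory, get_opro_run,
)

run_id = create_opro_run(
    "把中文商品评论分类为 positive 或 negative",
    test_cases=[("这个产品太好用了", "positive")],
)
add_opro_evaluation(run_id, "你是情感分类助手，只输出 positive 或 negative。", 0.9)
add_opro_evaluation(run_id, "请判断情感。", 0.4)

print(get_opro_trajectory(run_id))          # sorted by score, highest first
print(get_opro_run(run_id)["best_score"])   # 0.9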

View File

@@ -1,12 +1,18 @@
import re
import numpy as np
from typing import List, Tuple
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
import config
from .ollama_client import call_qwen
from .xinference_client import embed_texts
from .prompt_utils import refine_instruction, refine_instruction_with_history
from .prompt_utils import (
refine_instruction,
refine_instruction_with_history,
generate_initial_system_instruction_candidates,
generate_optimized_system_instruction
)
def parse_candidates(raw: str) -> list:
lines = [l.strip() for l in re.split(r'\r?\n', raw) if l.strip()]
@@ -33,7 +39,7 @@ def cluster_and_select(candidates: list, top_k=config.TOP_K, distance_threshold=
linkage="average")
labels = clustering.fit_predict(X)
selected_idx = []
for label in sorted(set(labels)):
idxs = [i for i,l in enumerate(labels) if l == label]
sims = cosine_similarity(X[idxs]).mean(axis=1)
@@ -44,6 +50,10 @@ def cluster_and_select(candidates: list, top_k=config.TOP_K, distance_threshold=
return selected[:top_k]
def generate_candidates(query: str, rejected=None, top_k=config.TOP_K, model_name=None):
"""
LEGACY: Query rewriting function (NOT true OPRO).
Kept for backward compatibility with existing API endpoints.
"""
rejected = rejected or []
if rejected:
prompt = refine_instruction_with_history(query, rejected)
@@ -53,3 +63,87 @@ def generate_candidates(query: str, rejected=None, top_k=config.TOP_K, model_nam
raw = call_qwen(prompt, temperature=0.9, max_tokens=1024, model_name=model_name)
all_candidates = parse_candidates(raw)
return cluster_and_select(all_candidates, top_k=top_k)
# ============================================================================
# TRUE OPRO FUNCTIONS (System Instruction Optimization)
# ============================================================================
def generate_system_instruction_candidates(
task_description: str,
trajectory: List[Tuple[str, float]] = None,
top_k: int = config.TOP_K,
pool_size: int = None,
model_name: str = None
) -> List[str]:
"""
TRUE OPRO: Generates optimized system instruction candidates.
This is the core OPRO function that generates system instructions based on
performance trajectory (if available) or initial candidates (if starting fresh).
Args:
task_description: Description of the task the LLM should perform
trajectory: Optional list of (instruction, score) tuples from previous iterations
top_k: Number of diverse candidates to return (default: config.TOP_K = 5)
pool_size: Number of candidates to generate before clustering (default: config.GENERATION_POOL_SIZE = 10)
model_name: Optional model name to use for generation
Returns:
List of top-k diverse system instruction candidates
"""
pool_size = pool_size or config.GENERATION_POOL_SIZE
# Generate the meta-prompt based on whether we have trajectory data
if trajectory and len(trajectory) > 0:
# Sort trajectory by score (highest first)
sorted_trajectory = sorted(trajectory, key=lambda x: x[1], reverse=True)
meta_prompt = generate_optimized_system_instruction(task_description, sorted_trajectory, pool_size)
else:
# No trajectory yet, generate initial candidates
meta_prompt = generate_initial_system_instruction_candidates(task_description, pool_size)
# Use the optimizer LLM to generate candidates
raw = call_qwen(meta_prompt, temperature=0.9, max_tokens=1024, model_name=model_name)
# Parse the generated candidates
all_candidates = parse_candidates(raw)
# Cluster and select diverse representatives
return cluster_and_select(all_candidates, top_k=top_k)
def evaluate_system_instruction(
system_instruction: str,
test_cases: List[Tuple[str, str]],
model_name: str = None
) -> float:
"""
TRUE OPRO: Evaluates a system instruction's performance on test cases.
Args:
system_instruction: The system instruction to evaluate
test_cases: List of (input, expected_output) tuples
model_name: Optional model name to use for evaluation
Returns:
Performance score (0.0 to 1.0)
"""
if not test_cases:
return 0.0
correct = 0
total = len(test_cases)
for input_text, expected_output in test_cases:
# Construct the full prompt with system instruction
full_prompt = f"{system_instruction}\n\n{input_text}"
# Get LLM response
response = call_qwen(full_prompt, temperature=0.2, max_tokens=512, model_name=model_name)
# Simple exact match scoring (can be replaced with more sophisticated metrics)
if expected_output.strip().lower() in response.strip().lower():
correct += 1
return correct / total
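
Note that evaluate_system_instruction scores by case-insensitive substring containment of the expected output in the model's response; the standalone snippet below reproduces just that rule on canned strings so the behaviour is easy to check without an LLM call.

def exact_match(expected: str, response: str) -> bool:
    # Same rule as evaluate_system_instruction: the expected text must appear
    # (case-insensitively) anywhere inside the model response.
    return expected.strip().lower() in response.strip().lower()

assert exact_match("positive", "The sentiment is POSITIVE.")      # substring hit
assert not exact_match("positive", "The sentiment looks good.")   # miss, counts as wrong
# A run's score is then simply correct / total over all test cases.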