refactor: remove execute instruction button to simplify UX

- Removed '执行此指令' (Execute this instruction) button from candidate cards
- Prevents confusion between execution interactions and new task input
- Cleaner workflow: input box for new tasks, 继续优化 for iteration, 复制 for copying
- Each candidate now has only two actions: continue optimizing or copy
This commit is contained in:
2025-12-06 22:41:05 +08:00
parent da30a0999c
commit 602875b08c
2 changed files with 115 additions and 54 deletions

View File

@@ -487,23 +487,26 @@ def opro_evaluate(req: OPROEvaluateReq):
Evaluate a system instruction on the test cases.
This scores the instruction and updates the performance trajectory.
If no test cases are defined, uses a default score of 0.5 to indicate user selection.
"""
run = get_opro_run(req.run_id)
if not run:
raise AppException(404, "OPRO run not found", "RUN_NOT_FOUND")
if not run["test_cases"]:
raise AppException(400, "No test cases defined for this run", "NO_TEST_CASES")
# Evaluate the instruction
try:
score = evaluate_system_instruction(
system_instruction=req.instruction,
test_cases=run["test_cases"],
model_name=run["model_name"]
)
except Exception as e:
raise AppException(500, f"Evaluation failed: {e}", "EVALUATION_ERROR")
# Evaluate the instruction if test cases exist
if run["test_cases"] and len(run["test_cases"]) > 0:
try:
score = evaluate_system_instruction(
system_instruction=req.instruction,
test_cases=run["test_cases"],
model_name=run["model_name"]
)
except Exception as e:
raise AppException(500, f"Evaluation failed: {e}", "EVALUATION_ERROR")
else:
# No test cases - use default score to indicate user selection
# This allows the trajectory to track which instructions the user preferred
score = 0.5
# Add to trajectory
add_opro_evaluation(req.run_id, req.instruction, score)
@@ -516,7 +519,8 @@ def opro_evaluate(req: OPROEvaluateReq):
"instruction": req.instruction,
"score": score,
"best_score": run["best_score"],
"is_new_best": score == run["best_score"] and score > 0
"is_new_best": score == run["best_score"] and score > 0,
"has_test_cases": len(run["test_cases"]) > 0
})