# Copyright 2023 The OPRO Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The utility functions for prompt optimization."""

import collections
import json
import os
import pickle
import re
import sys

OPRO_ROOT_PATH = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
)
sys.path.insert(0, OPRO_ROOT_PATH)

import numpy as np
from opro.evaluation import eval_utils
import pandas as pd


def extract_string_in_square_brackets(input_string):
  # Extract the first substring wrapped in square brackets [] from
  # input_string and return it with the brackets stripped; return the empty
  # string "" if there are no square brackets.
  raw_result = re.findall(r"\[.*?\]", input_string)
  if raw_result:
    return raw_result[0][1:-1]
  else:
    return ""


def parse_tag_content(text, prefix="<TEXT>", suffix="</TEXT>"):
  pattern = f"{prefix}(.*?){suffix}"
  results = re.findall(pattern, text, re.DOTALL)
  return results
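
# Illustrative example (not part of the original module): re.DOTALL lets a tag
# pair span multiple lines, and every match is returned.
#   parse_tag_content("a <TEXT>foo</TEXT> b <TEXT>bar</TEXT>") returns
#   ["foo", "bar"]; a string with no tags returns [].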


def _bucketize_float(num, n_buckets=20):
  # Likely used to convert an instruction's score (e.g., 0.85) into an
  # integer bucket.
  assert num >= 0 and num <= 1, "The given number must be between 0 and 1."
  return round(num * n_buckets)
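
# Illustrative example (not part of the original module): with the default 20
# buckets, an accuracy of 0.85 maps to round(0.85 * 20) = 17, so the optimizer
# sees integer scores in [0, 20].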


def gen_ins_and_score_pairs_substr(
    old_instructions_and_scores,
    old_instruction_score_threshold=0.1,
    max_num_instructions=1000,
    return_str_only=False,
    num_score_buckets=np.inf,
):
  """Generate the string that includes instruction-score pairs."""
  assert num_score_buckets == np.inf or isinstance(num_score_buckets, int)
  old_instructions_and_scores_str = ""
  old_instructions_and_scores = sorted(
      old_instructions_and_scores, key=lambda x: x[1]
  )[-max_num_instructions:]
  old_instructions_and_scores_in_meta_prompt = []
  for instruction, score, i_step in old_instructions_and_scores:
    if (
        not old_instruction_score_threshold
        or score >= old_instruction_score_threshold
    ):
      old_instructions_and_scores_in_meta_prompt.append(
          (instruction, score, i_step)
      )
      if num_score_buckets == np.inf:
        score_to_show = round(score, 3)
      else:
        score_to_show = _bucketize_float(score, num_score_buckets)
      old_instructions_and_scores_str += (
          f"\ntext:\n{instruction}\nscore:\n{score_to_show}\n"
      )
  if return_str_only:
    return old_instructions_and_scores_str
  else:
    return (
        old_instructions_and_scores_str,
        old_instructions_and_scores_in_meta_prompt,
    )
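
# Illustrative example (not part of the original module): with two surviving
# instructions scored 0.314 and 0.609 and num_score_buckets == np.inf, the
# returned string looks like (ascending by score):
#
#   text:
#   Let's figure it out!
#   score:
#   0.314
#
#   text:
#   Let's think step by step.
#   score:
#   0.609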


def gen_meta_prompt(
    old_instructions_and_scores,
    instruction_pos,
    optimizer_llm_name,
    old_instruction_score_threshold=0.1,
    max_num_instructions=1000,
    meta_prompt_type="both_instructions_and_exemplars",
    few_shot_qa_pairs=False,
    include_qa=True,
    data=None,
    few_shot_index_list=None,
    instructions_before_exemplars=True,
    num_score_buckets=np.inf,
    dataset_name="",
    task_name="",
):
  """Generate meta prompt for instruction rewriting.

  Args:
   old_instructions_and_scores (list): a list of (instruction, score, i_step)
     tuples.
   instruction_pos (str): where to put the instruction, one of {'before_Q',
     'Q_begin', 'Q_end', 'A_begin'}.
   optimizer_llm_name (str): the name of the LLM used for instruction editing.
   old_instruction_score_threshold (float): only add old instructions with a
     score no less than this threshold.
   max_num_instructions (int): the maximum number of instructions in the meta
     prompt.
   meta_prompt_type (str): the type of meta-prompt: whether to have both
     previous instructions and dataset exemplars (often for fine-tuned
     optimizers), or to have only previous instructions (often for pre-trained
     optimizers).
   few_shot_qa_pairs (bool): whether to have few-shot QA pairs in the meta
     prompt.
   include_qa (bool): whether to include "Q:" and "A:" formats in the prompt.
   data (list or pd.DataFrame): the raw data.
   few_shot_index_list (list): the list of indices of few-shot examples.
   instructions_before_exemplars (bool): whether the instruction-score pairs
     come before the exemplars from the dataset.
   num_score_buckets (np.inf or int): the number of score buckets when
     converting float accuracies to integers. Defaults to np.inf for no
     bucketizing.
   dataset_name (str): the name of the current dataset. Only used to generate
     the task description when meta_prompt_type == "instructions_only".
   task_name (str): the name of the current task. Only used to generate the
     task description when meta_prompt_type == "instructions_only".

  Returns:
    meta_prompt (str): the generated meta prompt.
  """
  assert instruction_pos in {
      "before_Q",
      "Q_begin",
      "Q_end",
      "A_begin",
  }, (
      "The instruction position should be before the question, at the"
      " beginning of the question, at the end of the question, or at the"
      " beginning of the answer."
  )
  assert meta_prompt_type in {
      "both_instructions_and_exemplars",
      "instructions_only",
  }
  assert dataset_name in {
      "mmlu",
      "bbh",
      "gsm8k",
  }, "The lower-case dataset name must be one of mmlu, bbh, gsm8k."
  assert num_score_buckets == np.inf or isinstance(num_score_buckets, int)

  meta_prompt = ""
  if meta_prompt_type == "both_instructions_and_exemplars":
    if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
      # prompt template for GPT-style models
      if instruction_pos == "A_begin":
        # wording for the answer-beginning position: ask the model to generate
        # an answer-starting sentence (e.g., "The answer is ..."), given the
        # scored history of starting sentences
        meta_prompt_old_instruction_part = (
            "Your task is to generate the answer starting sentence <Start>."
            " Below are some previous starting sentences with their scores."
            " The score ranges from 0 to 100.\n"
        )
      else:
        # wording for ordinary instructions
        meta_prompt_old_instruction_part = (
            "Your task is to generate the instruction <INS>."
            " Below are some previous instructions with their scores."
            " The score ranges from 0 to 100.\n"
        )
    else:
      # prompt template specific to text-bison
      assert optimizer_llm_name.lower() == "text-bison"
      meta_prompt_old_instruction_part = (
          "I have some texts along with their corresponding scores."
          " The texts are arranged in ascending order based on their scores,"
          " where higher scores indicate better quality.\n\n"
      )
    # add old instructions
    old_instructions_and_scores_str = gen_ins_and_score_pairs_substr(
        old_instructions_and_scores=old_instructions_and_scores,
        old_instruction_score_threshold=old_instruction_score_threshold,
        max_num_instructions=max_num_instructions,
        return_str_only=True,
        num_score_buckets=num_score_buckets,
    )
    meta_prompt_old_instruction_part += old_instructions_and_scores_str
    # add QA pairs if few_shot_qa_pairs == True
    meta_prompt_exemplar_part = ""
    if few_shot_qa_pairs:
      if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
        meta_prompt_exemplar_part += "Below are some problems.\n"
      else:
        assert optimizer_llm_name.lower() == "text-bison"
        meta_prompt_exemplar_part += (
            "The following exemplars show how to apply your text: you replace"
            " <INS> in each input with your text, then read the input and give"
            " an output. We say your output is wrong if your output is"
            " different from the given output, and we say your output is"
            " correct if they are the same. When replacing <INS> with an old"
            " piece of text above, we get wrong outputs on the following"
            " inputs.\n\n"
        )
      for idx in few_shot_index_list:
        if dataset_name == "mmlu":
          question = eval_utils._format_mmlu_example(data, idx)  # pylint: disable=protected-access
          true_answer = data.iloc[idx, -1]
        elif dataset_name == "bbh":
          question = data[idx]["input"]
          true_answer = data[idx]["target"]
        else:
          assert dataset_name == "gsm8k"
          question = data.iloc[idx, 0]
          true_answer = data.iloc[idx, 1]

        if include_qa:  # when "Q:" and "A:" are present in the prompt
          if instruction_pos == "before_Q":
            meta_prompt_exemplar_part += f"\ninput:\n<INS>\nQ: {question}\nA:"
          elif instruction_pos == "Q_begin":
            meta_prompt_exemplar_part += f"\ninput:\nQ: <INS>\n{question}\nA:"
          elif instruction_pos == "Q_end":
            meta_prompt_exemplar_part += f"\ninput:\nQ: {question}\n<INS>\nA:"
          else:  # instruction_pos == "A_begin"
            if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
              meta_prompt_exemplar_part += f"\nQ: {question}\nA: <Start>"
            else:
              assert optimizer_llm_name.lower() == "text-bison"
              meta_prompt_exemplar_part += f"\ninput:\nQ: {question}\nA: <INS>"
        else:  # when there are no "Q:" and "A:" in the prompt
          assert instruction_pos in {"Q_begin", "Q_end"}
          if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
            if instruction_pos == "Q_begin":
              meta_prompt_exemplar_part += f"\nProblem:\n<INS>\n{question}\n"
            elif instruction_pos == "Q_end":
              meta_prompt_exemplar_part += f"\nProblem:\n{question}\n<INS>\n"
          else:
            assert optimizer_llm_name.lower() == "text-bison"
            if instruction_pos == "Q_begin":
              meta_prompt_exemplar_part += f"\ninput:\n<INS>\n{question}\n"
            elif instruction_pos == "Q_end":
              meta_prompt_exemplar_part += f"\ninput:\n{question}\n<INS>\n"

        if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
          meta_prompt_exemplar_part += (
              f"\nGround truth answer:\n{true_answer}\n"
          )
        else:
          assert optimizer_llm_name.lower() == "text-bison"
          meta_prompt_exemplar_part += f"\noutput:\n{true_answer}\n"

    if few_shot_qa_pairs:
      if instructions_before_exemplars:
        meta_prompt += (
            meta_prompt_old_instruction_part
            + "\n\n"
            + meta_prompt_exemplar_part
        )
      else:
        meta_prompt += (
            meta_prompt_exemplar_part
            + "\n\n"
            + meta_prompt_old_instruction_part
        )
    else:
      meta_prompt += meta_prompt_old_instruction_part

    if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
      if instruction_pos == "A_begin":
        meta_prompt += (
            "\n\nGenerate a starting sentence that is different from all the"
            " <Start> sentences above, and has a higher score than all the"
            " <Start> sentences above. The starting sentence should begin with"
            " <Start> and end with </Start>. The starting sentence should be"
            " concise, effective, and generally applicable to all QA pairs"
            " above."
        )
      else:
        meta_prompt += (
            "\n\nGenerate an instruction that"
            " is different from all the instructions <INS> above,"
            " and has a higher score than all the instructions <INS> above."
            " The instruction should begin with <INS> and end with </INS>."
            " The instruction should be concise, effective,"
            " and generally applicable to all problems above."
        )
    else:
      assert optimizer_llm_name.lower() == "text-bison"
      meta_prompt += (
          "\n\nWrite your new text that is different from the old ones and"
          " has a score as high as possible. Write the text in square brackets."
      )
  else:
    # when using a pre-trained model as optimizer
    assert meta_prompt_type == "instructions_only"

    assert instruction_pos in {"Q_begin", "Q_end", "A_begin"}
    if instruction_pos == "Q_begin":
      instruction_pos_description = "at the beginning of the question"
    elif instruction_pos == "Q_end":
      instruction_pos_description = "at the end of the question"
    else:
      assert instruction_pos == "A_begin"
      instruction_pos_description = "at the beginning of the answer"

    if dataset_name == "gsm8k":
      instruction_task_description = "grade school math"
    elif dataset_name == "mmlu":
      instruction_task_description = task_name
    else:
      assert dataset_name == "bbh"
      instruction_task_description = " ".join(task_name.split("_"))

    meta_instruction = (
        f"Create a piece of text {instruction_pos_description.strip()} to"
        " enhance the precision in solving diverse"
        f" {instruction_task_description.strip()} problems."
    )
    old_instructions_and_scores = sorted(
        old_instructions_and_scores, key=lambda x: x[1]
    )
    old_instructions_and_scores_str = ""
    for instruction, score, _ in old_instructions_and_scores:
      if num_score_buckets == np.inf:
        score_to_show = round(score, 2)
      else:
        score_to_show = _bucketize_float(score, num_score_buckets)
      old_instructions_and_scores_str += (
          f"\n\nPrecision: {score_to_show} <TEXT>{instruction}</TEXT>"
      )
    meta_prompt += meta_instruction + old_instructions_and_scores_str
  return meta_prompt
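
# Minimal usage sketch (hypothetical argument values; the real call sites are
# in run_evolution below):
#   meta_prompt = gen_meta_prompt(
#       old_instructions_and_scores=[("Let's think step by step.", 0.61, -1)],
#       instruction_pos="Q_begin",
#       optimizer_llm_name="gpt-3.5-turbo",
#       dataset_name="gsm8k",
#   )
# With the defaults this yields the "previous instructions with scores"
# preamble followed by the request to write a new <INS> instruction.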


def run_evolution(**kwargs):
  """The function for evolution."""
  # ================= experiment configurations =============================
  num_search_steps = kwargs["num_search_steps"]
  old_instruction_score_threshold = kwargs["old_instruction_score_threshold"]
  scorer_llm_dict = kwargs["scorer_llm_dict"]
  optimizer_llm_dict = kwargs["optimizer_llm_dict"]
  extract_final_answer_by_prompting_again = kwargs[
      "extract_final_answer_by_prompting_again"
  ]
  include_qa = kwargs["include_qa"]
  evaluate_in_parallel = kwargs["evaluate_in_parallel"]
  tasks_all = kwargs["tasks_all"]
  train_ratio = kwargs["train_ratio"]
  eval_ratio = kwargs["eval_ratio"]
  test_ratio = kwargs["test_ratio"]
  train_index = kwargs["train_index"]
  eval_index = kwargs["eval_index"]
  dataset_name = kwargs["dataset_name"]
  task_name = kwargs["task_name"]
  num_examples = kwargs["num_examples"]
  root_data_folder_path = kwargs["root_data_folder_path"]
  optimizer_llm_temperature = kwargs["optimizer_llm_temperature"]
  optimizer_llm_temperature_schedule = (
      kwargs["optimizer_llm_temperature_schedule"]
      if "optimizer_llm_temperature_schedule" in kwargs
      else "constant"
  )
  optimizer_llm_temperature_end = (
      kwargs["optimizer_llm_temperature_end"]
      if "optimizer_llm_temperature_end" in kwargs
      else None
  )
  initial_instructions = kwargs["initial_instructions"]
  multiple_choice_tasks = kwargs["multiple_choice_tasks"]
  raw_data = kwargs["raw_data"]
  call_scorer_server_func = kwargs["call_scorer_server_func"]
  call_optimizer_server_func = kwargs["call_optimizer_server_func"]
  instruction_pos = kwargs["instruction_pos"]
  prediction_treat_as_number = kwargs["prediction_treat_as_number"]
  prediction_treat_as_bool = kwargs["prediction_treat_as_bool"]
  result_by_instruction_folder = kwargs["result_by_instruction_folder"]
  few_shot_qa_pairs = kwargs["few_shot_qa_pairs"]
  num_score_buckets = kwargs["num_score_buckets"]
  max_num_instructions = kwargs["max_num_instructions"]
  meta_prompt_type = kwargs["meta_prompt_type"]
  meta_prompt_instructions_before_exemplars = kwargs[
      "meta_prompt_instructions_before_exemplars"
  ]
  few_shot_selection_criteria = kwargs["few_shot_selection_criteria"]
  optimizer_llm_name = kwargs["optimizer_llm_name"]
  num_generated_instructions_in_each_step = kwargs[
      "num_generated_instructions_in_each_step"
  ]
  evaluate_generated_ins_on_few_shot = kwargs[
      "evaluate_generated_ins_on_few_shot"
  ]
  num_few_shot_questions_for_instruction_refinement = kwargs[
      "num_few_shot_questions_for_instruction_refinement"
  ]
  evaluate_old_ins_on_few_shot = kwargs["evaluate_old_ins_on_few_shot"]
  eval_interval = kwargs["eval_interval"]
  save_folder = kwargs["save_folder"]
  verbose = kwargs["verbose"] if "verbose" in kwargs else False

  # =================== assertions =====================
  assert dataset_name in {
      "mmlu",
      "bbh",
      "gsm8k",
  }, "The lower-case dataset name must be one of mmlu, bbh, gsm8k."
  assert optimizer_llm_temperature_schedule in {
      "constant",
      "linear_increase",
  }, "The temperature schedule should be constant or linear_increase."

  # =================== save configurations to json file ====================
  configs_dict = dict()
  configs_dict["scorer_llm_dict"] = scorer_llm_dict
  configs_dict["optimizer_llm_dict"] = optimizer_llm_dict
  configs_dict["instruction_pos"] = instruction_pos
  configs_dict["optimizer_llm_temperature"] = optimizer_llm_temperature
  configs_dict["optimizer_llm_temperature_schedule"] = (
      optimizer_llm_temperature_schedule
  )
  configs_dict["optimizer_llm_temperature_end"] = optimizer_llm_temperature_end
  with open(os.path.join(save_folder, "configs_dict.json"), "w") as f:
    json.dump(configs_dict, f, indent=4)

  num_servers = scorer_llm_dict["num_servers"]
  batch_size = scorer_llm_dict["batch_size"]
  generated_ins_on_few_shot_results_dict = dict()
  old_ins_on_few_shot_results_dict = dict()
  # evaluation results every eval_interval steps
  # format: [(i_step, instruction, detailed_results_df)]
  eval_results = []
  # all generated instructions, format: [(instruction, score, step_index)]
  # the instructions that were skipped have score NaN
  old_instructions_and_scores_raw = []
  # the new instructions, format: [(instruction, score, step_index)]
  old_instructions_and_scores = []
  meta_prompts = []  # format: [(meta_prompt, step_index)]
  instruction_score_dict = dict()  # the dictionary of {instruction: score}
  # the dictionary of the few-shot QA indices in meta-prompt
  # key: step index; value: the list of few-shot indices in that step
  few_shot_index_list_by_step_dict = dict()
  detailed_results_df_by_instruction_dict = dict()
  wrong_questions_from_start_counter = collections.Counter()
  # EVAL results
  eval_detailed_results_df_dict = dict()  # {instruction: detailed_results_df}
  instruction_eval_score_dict = dict()  # {instruction: eval_score}
  old_instruction_md5_hashstrings_set = set()

  print(f"tasks_all: {tasks_all}")
  print(
      f"train_ratio: {train_ratio}, number of training points:"
      f" {int(num_examples * train_ratio)}"
  )
  print(
      f"eval_ratio: {eval_ratio}, number of eval points: "
      f"{int(num_examples * eval_ratio)}"
  )
  print(
      f"test_ratio: {test_ratio}, number of test points: "
      f"{int(num_examples * test_ratio)}"
  )
  print(
      f"optimizer llm temperature: {optimizer_llm_temperature}, schedule:"
      f" {optimizer_llm_temperature_schedule}"
  )
  print(
      f"generating {num_generated_instructions_in_each_step} instructions in"
      f" each step, run for {num_search_steps} steps"
  )
  print(
      "discarding generated instructions with score less than:"
      f" {old_instruction_score_threshold} (old_instruction_score_threshold)"
  )
  print(f"num_score_buckets: {num_score_buckets}")

  if dataset_name == "mmlu":
    is_multiple_choice = True
    is_multiple_choice_eval = True
  elif dataset_name in {"gsm8k"}:
    is_multiple_choice = False
    is_multiple_choice_eval = False
  else:
    assert dataset_name == "bbh"
    is_multiple_choice = []
    is_multiple_choice_eval = []
    train_index_by_task_dict = dict()
    eval_index_by_task_dict = dict()
    start_index = 0
    for task_name in tasks_all:
      single_task_list = eval_utils.load_bbh_task_data(
          task_name, base_dir=root_data_folder_path
      )
      end_index = start_index + len(single_task_list)
      train_index_by_task_dict[task_name] = (
          train_index[(train_index >= start_index) & (train_index < end_index)]
          # subtracting start_index here would instead give indices within the
          # original single task
      )
      eval_index_by_task_dict[task_name] = (
          eval_index[(eval_index >= start_index) & (eval_index < end_index)]
          # subtracting start_index here would instead give indices within the
          # original single task
      )
      start_index = end_index
      is_multiple_choice_single_task_train = [
          task_name in multiple_choice_tasks
      ] * len(train_index_by_task_dict[task_name])
      is_multiple_choice_single_task_eval = [
          task_name in multiple_choice_tasks
      ] * len(eval_index_by_task_dict[task_name])
      is_multiple_choice += is_multiple_choice_single_task_train
      is_multiple_choice_eval += is_multiple_choice_single_task_eval

  prev_saved_instructions = set()

  # evaluate initial instructions
  print("\n============== evaluating initial instructions ===============")
  for instruction in initial_instructions:
    print(f"""computing the score of "{instruction}" by prompting""")

    detailed_results_df = eval_utils.evaluate_single_instruction(
        data=raw_data,
        instruction=instruction,
        eval_index_all=train_index,
        batch_size=batch_size,
        call_server_func=call_scorer_server_func,
        dataset_name=dataset_name,
        num_servers=num_servers,
        extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again,
        include_qa=include_qa,
        evaluate_in_parallel=evaluate_in_parallel,
        instruction_pos=instruction_pos,
        is_multiple_choice=is_multiple_choice,
        prediction_treat_as_number=prediction_treat_as_number,
        prediction_treat_as_bool=prediction_treat_as_bool,
        prediction_num_decimals=0,
        max_retry=120,
        sleep_time=60,
        verbose=verbose,
    )

    detailed_results_df_by_instruction_dict[instruction] = detailed_results_df
    scores = detailed_results_df["accuracy"]
    average_score = np.average(scores)
    print(f"instruction: {instruction}, score: {average_score}")
    filename = eval_utils.instruction_to_filename(instruction)
    file_path = os.path.join(result_by_instruction_folder, f"{filename}.csv")
    detailed_results_df.to_csv(file_path, index=True, header=True)
    print(f"""saving results of "{instruction}" to {file_path}""")
    old_instructions_and_scores.append((instruction, average_score, -1))
    old_instructions_and_scores_raw.append((instruction, average_score, -1))
    instruction_score_dict[instruction] = average_score

    # increment the counter on wrong questions
    wrong_question_indices_set = set(
        list(
            detailed_results_df.iloc[
                np.where(detailed_results_df.accuracy == 0.0)[0], :
            ].index
        )
    )
    for idx in wrong_question_indices_set:
      wrong_questions_from_start_counter[idx] += 1

  # evolution
  for i_step in range(num_search_steps):
    print(f"\n================== Step {i_step} =====================")
    if not i_step % 10:
      print(f"old_instructions_and_scores: {old_instructions_and_scores}")

    if optimizer_llm_temperature_schedule == "linear_increase":
      optimizer_llm_temperature_curr = (
          optimizer_llm_temperature
          + i_step
          / num_search_steps
          * (optimizer_llm_temperature_end - optimizer_llm_temperature)
      )
    else:
      optimizer_llm_temperature_curr = optimizer_llm_temperature
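
    # Worked example (illustrative): with optimizer_llm_temperature=1.0,
    # optimizer_llm_temperature_end=2.0 and num_search_steps=200, step 100
    # runs the optimizer at 1.0 + (100 / 200) * (2.0 - 1.0) = 1.5.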
    print(
        f"current optimizer_llm_temperature: {optimizer_llm_temperature_curr}"
    )

    # generate new instructions
    if few_shot_qa_pairs:
      if few_shot_selection_criteria == "accumulative_most_frequent":
        # select the QA pairs that were answered wrong the most times
        most_frequent_wrong_question_indices = [
            k
            for k, _ in sorted(
                wrong_questions_from_start_counter.items(), key=lambda x: -x[1]
            )
        ]
        print(
            "len(most_frequent_wrong_question_indices):"
            f" {len(most_frequent_wrong_question_indices)}"
        )
        if (
            len(most_frequent_wrong_question_indices)
            <= num_few_shot_questions_for_instruction_refinement
        ):
          few_shot_index_list = most_frequent_wrong_question_indices.copy()
        else:
          np.random.seed(i_step)
          few_shot_index_list = np.sort(
              np.random.choice(
                  most_frequent_wrong_question_indices,
                  num_few_shot_questions_for_instruction_refinement,
                  replace=False,
              )
          )

      elif few_shot_selection_criteria == "current_most_frequent":
        # show the exemplars done wrong most often by the instructions
        # currently shown in the meta-prompt
        old_instruction_score_threshold_single_step = (
            old_instruction_score_threshold if i_step > 0 else 0
        )
        _, old_instructions_and_scores_in_meta_prompt = (
            gen_ins_and_score_pairs_substr(
                old_instructions_and_scores=old_instructions_and_scores,
                old_instruction_score_threshold=old_instruction_score_threshold_single_step,
                max_num_instructions=max_num_instructions,
                return_str_only=False,
                num_score_buckets=num_score_buckets,
            )
        )
        wrong_questions_counter_single_step = collections.Counter()
        for ins, _, _ in old_instructions_and_scores_in_meta_prompt:
          filename = eval_utils.instruction_to_filename(ins)
          file_path = os.path.join(
              result_by_instruction_folder, f"{filename}.csv"
          )
          single_ins_df = pd.read_csv(file_path, index_col=0, header=0)
          wrong_question_indices_set_single_old_ins = set(
              list(
                  single_ins_df.iloc[
                      np.where(single_ins_df.accuracy == 0.0)[0], :
                  ].index
              )
          )
          for idx in wrong_question_indices_set_single_old_ins:
            wrong_questions_counter_single_step[idx] += 1
        most_occurred_wrong_questions = [
            k
            for k, v in wrong_questions_counter_single_step.items()
            if v == max(wrong_questions_counter_single_step.values())
        ]
        if (
            len(most_occurred_wrong_questions)
            < num_few_shot_questions_for_instruction_refinement
        ):
          # pylint: disable=cell-var-from-loop
          idx_most_to_least = sorted(
              wrong_questions_counter_single_step,
              key=lambda x: -wrong_questions_counter_single_step[x],
          )
          few_shot_index_list = idx_most_to_least[
              :num_few_shot_questions_for_instruction_refinement
          ]
        else:
          few_shot_index_list = np.sort(
              np.random.choice(
                  most_occurred_wrong_questions,
                  num_few_shot_questions_for_instruction_refinement,
                  replace=False,
              )
          )
      elif few_shot_selection_criteria == "constant":
        np.random.seed(0)
        few_shot_index_list = np.sort(
            np.random.choice(
                train_index,
                num_few_shot_questions_for_instruction_refinement,
                replace=False,
            )
        )
      else:
        assert few_shot_selection_criteria == "random"
        np.random.seed(i_step)
        few_shot_index_list = np.sort(
            np.random.choice(
                train_index,
                num_few_shot_questions_for_instruction_refinement,
                replace=False,
            )
        ).tolist()

      few_shot_index_list_by_step_dict[i_step] = few_shot_index_list

      meta_prompt = gen_meta_prompt(
          old_instructions_and_scores=old_instructions_and_scores,
          instruction_pos=instruction_pos,
          optimizer_llm_name=optimizer_llm_name,
          old_instruction_score_threshold=old_instruction_score_threshold,
          max_num_instructions=max_num_instructions,
          meta_prompt_type=meta_prompt_type,
          few_shot_qa_pairs=few_shot_qa_pairs,
          include_qa=include_qa,
          data=raw_data,
          few_shot_index_list=few_shot_index_list,
          instructions_before_exemplars=meta_prompt_instructions_before_exemplars,
          num_score_buckets=num_score_buckets,
          dataset_name=dataset_name,
          task_name=task_name,
      )

    else:  # no few-shot exemplars in meta-prompt
      few_shot_index_list = []
      meta_prompt = gen_meta_prompt(
          old_instructions_and_scores=old_instructions_and_scores,
          instruction_pos=instruction_pos,
          optimizer_llm_name=optimizer_llm_name,
          old_instruction_score_threshold=old_instruction_score_threshold,
          max_num_instructions=max_num_instructions,
          meta_prompt_type=meta_prompt_type,
          few_shot_qa_pairs=False,
          include_qa=include_qa,
          instructions_before_exemplars=meta_prompt_instructions_before_exemplars,
          num_score_buckets=num_score_buckets,
          dataset_name=dataset_name,
          task_name=task_name,
      )
    print(f"\nmeta_prompt: \n\n{meta_prompt}\n")
    meta_prompts.append((meta_prompt, i_step))
    remaining_num_instructions_to_generate = (
        num_generated_instructions_in_each_step
    )
    generated_instructions_raw = []
    while remaining_num_instructions_to_generate > 0:
      optimizer_llm_input_text = meta_prompt
      # generate instructions
      print(f"current temperature: {optimizer_llm_temperature_curr}")
      raw_outputs = call_optimizer_server_func(
          optimizer_llm_input_text,
          temperature=optimizer_llm_temperature_curr,
      )

      # Extract the generated instructions from the optimizer LLM output. Only
      # keep some samples if the desired number of remaining instructions
      # is smaller than the total number of decodes in this step.
      if meta_prompt_type == "both_instructions_and_exemplars":
        raw_outputs = raw_outputs[:remaining_num_instructions_to_generate]
        if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
          if instruction_pos == "A_begin":
            start_string = "<Start>"
            end_string = "</Start>"
          else:
            start_string = "<INS>"
            end_string = "</INS>"
          for raw_output in raw_outputs:
            if start_string not in raw_output:
              start_index = 0
            else:
              start_index = raw_output.index(start_string) + len(start_string)
            if end_string not in raw_output:
              end_index = len(raw_output)
            else:
              end_index = raw_output.index(end_string)
            new_inst = raw_output[start_index:end_index].strip()
            generated_instructions_raw.append(new_inst)
        else:
          assert optimizer_llm_name.lower() == "text-bison"
          generated_instructions_raw += [
              extract_string_in_square_brackets(string)
              for string in raw_outputs
          ]

        remaining_num_instructions_to_generate -= optimizer_llm_dict[
            "batch_size"
        ]
      else:
        assert meta_prompt_type == "instructions_only"
        max_num_instructions_to_keep_in_each_output = 1
        for string in raw_outputs:
          generated_instructions_raw += parse_tag_content(string)[
              :max_num_instructions_to_keep_in_each_output
          ]
        remaining_num_instructions_to_generate -= (
            optimizer_llm_dict["batch_size"]
            * max_num_instructions_to_keep_in_each_output
        )

    generated_instructions_raw = list(
        map(eval_utils.polish_sentence, generated_instructions_raw)
    )
    print(f"\ninitially generated instructions: {generated_instructions_raw}\n")

    # do not evaluate old instructions again
    generated_instructions = []  # the new instructions generated in this step
    for ins in generated_instructions_raw:
      ins_md5_hashstring = eval_utils.instruction_to_filename(
          ins, md5_hashing=True
      )
      if ins_md5_hashstring not in old_instruction_md5_hashstrings_set:
        generated_instructions.append(ins)
        old_instruction_md5_hashstrings_set.add(ins_md5_hashstring)
      else:
        print(f"already evaluated '{ins}' previously")
    generated_instructions = list(set(generated_instructions))
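
    # Note (illustrative, not in the original code): deduplication keys on the
    # md5 hash of the instruction text, so a repeated decode such as
    # "Think carefully." is only evaluated once across all search steps.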

    to_evaluate_instructions = []
    for instruction in generated_instructions:
      if len(instruction) > 500:
        print(f"Step {i_step}, instruction: {instruction}, too long, skipped")
        continue
      if dataset_name == "gsm8k" and any(
          char.isdigit() for char in instruction
      ):
        print(
            f"Step {i_step}, instruction: {instruction}, contains numbers,"
            " skipped"
        )
        continue
      if "INS" in instruction:
        print(
            f"Step {i_step}, instruction: {instruction}, contains 'INS',"
            " skipped"
        )
        continue
      to_evaluate_instructions.append(instruction)
    print(f"\nto-evaluate generated instructions: {to_evaluate_instructions}\n")

    # evaluate new instructions on the few-shot exemplars in meta-prompt
    if few_shot_qa_pairs and evaluate_generated_ins_on_few_shot:
      print("evaluating GENERATED instructions on few-shot exemplars")
      single_step_eval_on_few_shot = dict()
      for instruction in to_evaluate_instructions:
        if instruction not in prev_saved_instructions:
          print(
              f"evaluating Step {i_step}, instruction: {instruction} on"
              " few-shot exemplars"
          )
          detailed_results_df = eval_utils.evaluate_single_instruction(
              data=raw_data,
              instruction=instruction,
              eval_index_all=few_shot_index_list,
              batch_size=batch_size,
              call_server_func=call_scorer_server_func,
              dataset_name=dataset_name,
              num_servers=num_servers,
              extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again,
              include_qa=include_qa,
              evaluate_in_parallel=evaluate_in_parallel,
              instruction_pos=instruction_pos,
              is_multiple_choice=is_multiple_choice,
              prediction_treat_as_number=prediction_treat_as_number,
              prediction_treat_as_bool=prediction_treat_as_bool,
              prediction_num_decimals=0,
              max_retry=5,
              sleep_time=180,
              verbose=verbose,
          )
          single_step_eval_on_few_shot[instruction] = detailed_results_df

      print(
          f"Step {i_step}, single_step_eval_on_few_shot:"
          f" {single_step_eval_on_few_shot}\n"
      )
      generated_ins_on_few_shot_results_dict[i_step] = (
          single_step_eval_on_few_shot
      )

    # evaluate OLD instructions on the few-shot exemplars in meta-prompt
    if few_shot_qa_pairs and evaluate_old_ins_on_few_shot:
      print("evaluating OLD instructions on few-shot exemplars")
      single_step_eval_on_few_shot = dict()
      for instruction, _, _ in old_instructions_and_scores:
        print(
            f"evaluating Step {i_step}, instruction: {instruction} on few-shot"
            " exemplars"
        )
        detailed_results_df = eval_utils.evaluate_single_instruction(
            data=raw_data,
            instruction=instruction,
            eval_index_all=few_shot_index_list,
            batch_size=scorer_llm_dict["batch_size"],
            call_server_func=call_scorer_server_func,
            dataset_name=dataset_name,
            num_servers=scorer_llm_dict["num_servers"],
            extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again,
            include_qa=include_qa,
            evaluate_in_parallel=evaluate_in_parallel,
            instruction_pos=instruction_pos,
            is_multiple_choice=is_multiple_choice,
            prediction_treat_as_number=prediction_treat_as_number,
            prediction_treat_as_bool=prediction_treat_as_bool,
            prediction_num_decimals=0,
            max_retry=5,
            sleep_time=180,
            verbose=verbose,
        )
        single_step_eval_on_few_shot[instruction] = detailed_results_df

      print(
          f"Step {i_step}, single_step_eval_on_few_shot:"
          f" {single_step_eval_on_few_shot}\n"
      )
      old_ins_on_few_shot_results_dict[i_step] = single_step_eval_on_few_shot

    # evaluate newly generated instructions on the training set
    for instruction in to_evaluate_instructions:
      if instruction not in prev_saved_instructions:
        print(f"""computing the score of "{instruction}" by prompting""")
        detailed_results_df = eval_utils.evaluate_single_instruction(
            data=raw_data,
            instruction=instruction,
            eval_index_all=train_index,
            batch_size=batch_size,
            call_server_func=call_scorer_server_func,
            dataset_name=dataset_name,
            num_servers=num_servers,
            extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again,
            include_qa=include_qa,
            evaluate_in_parallel=evaluate_in_parallel,
            instruction_pos=instruction_pos,
            is_multiple_choice=is_multiple_choice,
            prediction_treat_as_number=prediction_treat_as_number,
            prediction_treat_as_bool=prediction_treat_as_bool,
            prediction_num_decimals=0,
            max_retry=5,
            sleep_time=180,
            verbose=verbose,
        )
        prev_saved_instructions.add(instruction)
      else:
        # do not re-evaluate instructions that have been evaluated previously;
        # read them back from disk using the same filename convention they
        # were saved with
        filename = eval_utils.instruction_to_filename(instruction)
        detailed_results_df = pd.read_csv(
            os.path.join(result_by_instruction_folder, f"{filename}.csv"),
            index_col=0,
            header=0,
        )
        print(f"""reading previously saved "{instruction}" information""")

      scores = detailed_results_df["accuracy"]
      average_score = np.average(scores)
      print(
          f"Step {i_step}, instruction: {instruction}, score: {average_score}"
      )

      # increment the counter on wrong questions
      wrong_question_indices_set = set(
          list(
              detailed_results_df[detailed_results_df["accuracy"] == 0.0].index
          )
      )
      for idx in wrong_question_indices_set:
        wrong_questions_from_start_counter[idx] += 1

      filename = eval_utils.instruction_to_filename(instruction)
      file_path = os.path.join(
          result_by_instruction_folder, f"""{filename}.csv"""
      )
      detailed_results_df.to_csv(file_path, index=True, header=True)
      print(f"saving results to {file_path}")

      detailed_results_df_by_instruction_dict[instruction] = detailed_results_df
      old_instructions_and_scores.append((instruction, average_score, i_step))
      instruction_score_dict[instruction] = average_score

    # record all generated instructions
    for instruction in generated_instructions_raw:
      if instruction in instruction_score_dict:
        average_score = instruction_score_dict[instruction]
      else:
        average_score = np.nan
      old_instructions_and_scores_raw.append(
          (instruction, average_score, i_step)
      )

    # =============================== eval ====================================
    # every eval_interval steps, evaluate the instructions that were generated
    # in the current step and were not skipped
    if not i_step % eval_interval:
      for instruction in generated_instructions_raw:
        # if the instruction wasn't skipped in any step
        if instruction in instruction_score_dict:
          if instruction not in instruction_eval_score_dict:
            detailed_results_df = eval_utils.evaluate_single_instruction(
                data=raw_data,
                instruction=instruction,
                eval_index_all=eval_index,
                batch_size=batch_size,
                call_server_func=call_scorer_server_func,
                dataset_name=dataset_name,
                num_servers=num_servers,
                extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again,
                include_qa=include_qa,
                evaluate_in_parallel=evaluate_in_parallel,
                instruction_pos=instruction_pos,
                is_multiple_choice=is_multiple_choice_eval,
                prediction_treat_as_number=prediction_treat_as_number,
                prediction_treat_as_bool=prediction_treat_as_bool,
                prediction_num_decimals=0,
                max_retry=5,
                sleep_time=180,
                verbose=verbose,
            )
            eval_score = np.average(detailed_results_df["accuracy"])
            eval_detailed_results_df_dict[instruction] = detailed_results_df
            instruction_eval_score_dict[instruction] = eval_score
          else:
            eval_score = instruction_eval_score_dict[instruction]
          print(
              f"EVAL: \nStep {i_step}, instruction: {instruction}, eval score:"
              f" {eval_score:.2f}"
          )
          eval_results.append((i_step, instruction, eval_score))

    # ===================== save up-to-date results ===========================
    results_dict = dict()
    results_dict["meta_prompts"] = meta_prompts
    results_dict["old_instructions_and_scores"] = list(
        old_instructions_and_scores
    )
    results_dict["old_instructions_and_scores_raw"] = list(
        old_instructions_and_scores_raw
    )
    results_dict["generated_ins_on_few_shot_results_dict"] = (
        generated_ins_on_few_shot_results_dict
    )
    results_dict["old_ins_on_few_shot_results_dict"] = (
        old_ins_on_few_shot_results_dict
    )
    results_dict["few_shot_index_list_by_step_dict"] = (
        few_shot_index_list_by_step_dict
    )
    results_dict["eval_results"] = eval_results
    results_dict["eval_detailed_results_df_dict"] = (
        eval_detailed_results_df_dict
    )
    with open(os.path.join(save_folder, "results_dict.pkl"), "wb") as fp:
      pickle.dump(results_dict, fp)
    print(f"\nsaved all results to\n{save_folder}")