# Copyright 2023 The OPRO Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The utility functions for prompt optimization."""

import collections
import json
import os
import pickle
import re
import sys

OPRO_ROOT_PATH = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
)
sys.path.insert(0, OPRO_ROOT_PATH)

import numpy as np
from opro.evaluation import eval_utils
import pandas as pd


def extract_string_in_square_brackets(input_string):
  """Extracts the first substring wrapped in square brackets.

  Returns the content of the first [...] span in input_string with the
  brackets stripped, or "" if the input contains no square brackets.
  """
  raw_result = re.findall(r"\[.*?\]", input_string)
  if raw_result:
    return raw_result[0][1:-1]
  else:
    return ""

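# Example usage of extract_string_in_square_brackets (illustrative; this
# comment block is not part of the original module):
#   extract_string_in_square_brackets("New text: [Let's solve it carefully.]")
#   -> "Let's solve it carefully."
#   extract_string_in_square_brackets("no brackets here")
#   -> ""

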
def parse_tag_content(text, prefix="<TEXT>", suffix="</TEXT>"):
  pattern = f"{prefix}(.*?){suffix}"
  results = re.findall(pattern, text, re.DOTALL)
  return results

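# Example usage of parse_tag_content (illustrative; not part of the original
# module):
#   parse_tag_content("Precision: 83 <TEXT>Show your work.</TEXT>")
#   -> ["Show your work."]
#   When no <TEXT>...</TEXT> pair is present, an empty list is returned.

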
def _bucketize_float(num, n_buckets=20):
  # Maps a float score in [0, 1] (e.g. an instruction's accuracy of 0.85) to
  # an integer bucket in [0, n_buckets].
  assert num >= 0 and num <= 1, "The given number must be between 0 and 1."
  return round(num * n_buckets)

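# Example (illustrative; not part of the original module):
#   _bucketize_float(0.847, n_buckets=20) == round(16.94) == 17
#   _bucketize_float(0.847, n_buckets=100) == 85

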
def gen_ins_and_score_pairs_substr(
    old_instructions_and_scores,
    old_instruction_score_threshold=0.1,
    max_num_instructions=1000,
    return_str_only=False,
    num_score_buckets=np.inf,
):
  """Generate the string that includes instruction-score pairs."""
  assert num_score_buckets == np.inf or isinstance(num_score_buckets, int)
  old_instructions_and_scores_str = ""
  old_instructions_and_scores = sorted(
      old_instructions_and_scores, key=lambda x: x[1]
  )[-max_num_instructions:]
  old_instructions_and_scores_in_meta_prompt = []
  for instruction, score, i_step in old_instructions_and_scores:
    if (
        not old_instruction_score_threshold
        or score >= old_instruction_score_threshold
    ):
      old_instructions_and_scores_in_meta_prompt.append(
          (instruction, score, i_step)
      )
      if num_score_buckets == np.inf:
        score_to_show = round(score, 3)
      else:
        score_to_show = _bucketize_float(score, num_score_buckets)
      old_instructions_and_scores_str += (
          f"\ntext:\n{instruction}\nscore:\n{score_to_show}\n"
      )
  if return_str_only:
    return old_instructions_and_scores_str
  else:
    return (
        old_instructions_and_scores_str,
        old_instructions_and_scores_in_meta_prompt,
    )

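# Illustrative example of the string built above (not part of the original
# module): with
#   old_instructions_and_scores = [("Think step by step.", 0.6, 0),
#                                  ("Be concise.", 0.4, 1)]
# and num_score_buckets=100, the string returned with return_str_only=True
# contains one "text:/score:" block per instruction, sorted by ascending
# score:
#   text:
#   Be concise.
#   score:
#   40
#   text:
#   Think step by step.
#   score:
#   60

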
def gen_meta_prompt(
    old_instructions_and_scores,
    instruction_pos,
    optimizer_llm_name,
    old_instruction_score_threshold=0.1,
    max_num_instructions=1000,
    meta_prompt_type="both_instructions_and_exemplars",
    few_shot_qa_pairs=False,
    include_qa=True,
    data=None,
    few_shot_index_list=None,
    instructions_before_exemplars=True,
    num_score_buckets=np.inf,
    dataset_name="",
    task_name="",
):
  """Generate meta prompt for instruction rewriting.

  Args:
    old_instructions_and_scores (list): a list of (instruction, score, i_step)
      pairs.
    instruction_pos (str): where to put the instruction, one of {'before_Q',
      'Q_begin', 'Q_end', 'A_begin'}.
    optimizer_llm_name (str): the name of the LLM used for instruction editing.
    old_instruction_score_threshold (float): only add old instructions with
      score no less than this threshold.
    max_num_instructions (int): the maximum number of instructions in the meta
      prompt.
    meta_prompt_type (str): the type of meta-prompt: whether to have both
      previous instructions and dataset exemplars (often for fine-tuned
      optimizers), or to have only previous instructions (often for pre-trained
      optimizers).
    few_shot_qa_pairs (bool): whether to have few-shot QA pairs in the meta
      prompt.
    include_qa (bool): whether to include "Q:" and "A:" formats in the prompt.
    data (list or pd.DataFrame): the raw data.
    few_shot_index_list (list): the list of indices of few-shot examples.
    instructions_before_exemplars (bool): whether the instruction-score pairs
      are before the exemplars from the dataset.
    num_score_buckets (np.inf or int): the number of score buckets when we
      convert float accuracies to integers. Default to np.inf for not
      bucketizing.
    dataset_name (str): the name of the current dataset. Only used when
      generating task description when meta_prompt_type == "instructions_only".
    task_name (str): the name of the current task. Only used when generating
      task description when meta_prompt_type == "instructions_only".

  Returns:
    meta_prompt (str): the generated meta prompt.
  """
  assert instruction_pos in {
      "before_Q",
      "Q_begin",
      "Q_end",
      "A_begin",
  }, (
      "The instruction position should be either before the question, or at"
      " the beginning of the question, at the end of the question, or at the"
      " beginning of the answer."
  )
  assert meta_prompt_type in {
      "both_instructions_and_exemplars",
      "instructions_only",
  }
  assert dataset_name in {
      "mmlu",
      "bbh",
      "gsm8k",
  }, "The lower-case dataset name must be one of mmlu, bbh, gsm8k."
  assert num_score_buckets == np.inf or isinstance(num_score_buckets, int)

  meta_prompt = ""
  if meta_prompt_type == "both_instructions_and_exemplars":
    if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
      # Prompt template for GPT-style (and local) optimizer models.
      if instruction_pos == "A_begin":
        # Ask the model to generate the answer's starting sentence
        # (e.g. "The answer is ..."), given previously scored starting
        # sentences.
        meta_prompt_old_instruction_part = (
            "Your task is to generate the answer starting sentence <Start>."
            " Below are some previous starting sentences with their scores."
            " The score ranges from 0 to 100.\n"
        )
      else:
        # Ask the model to generate a regular instruction.
        meta_prompt_old_instruction_part = (
            "Your task is to generate the instruction <INS>."
            " Below are some previous instructions with their scores."
            " The score ranges from 0 to 100.\n"
        )
    else:
      # Prompt template specific to the text-bison optimizer model.
      assert optimizer_llm_name.lower() == "text-bison"
      meta_prompt_old_instruction_part = (
          "I have some texts along with their corresponding scores."
          " The texts are arranged in ascending order based on their scores,"
          " where higher scores indicate better quality.\n\n"
      )
    # add old instructions
    old_instructions_and_scores_str = gen_ins_and_score_pairs_substr(
        old_instructions_and_scores=old_instructions_and_scores,
        old_instruction_score_threshold=old_instruction_score_threshold,
        max_num_instructions=max_num_instructions,
        return_str_only=True,
        num_score_buckets=num_score_buckets,
    )
    meta_prompt_old_instruction_part += old_instructions_and_scores_str
    # add QA pairs if few_shot_qa_pairs == True
    meta_prompt_exemplar_part = ""
    if few_shot_qa_pairs:
      if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
        meta_prompt_exemplar_part += "Below are some problems.\n"
      else:
        assert optimizer_llm_name.lower() == "text-bison"
        meta_prompt_exemplar_part += (
            "The following exemplars show how to apply your text: you replace"
            " <INS> in each input with your text, then read the input and give"
            " an output. We say your output is wrong if your output is"
            " different from the given output, and we say your output is"
            " correct if they are the same. When replacing <INS> with an old"
            " piece of text above, we get wrong outputs on the following"
            " inputs.\n\n"
        )
      for idx in few_shot_index_list:
        if dataset_name == "mmlu":
          question = eval_utils._format_mmlu_example(data, idx)  # pylint: disable=protected-access
          true_answer = data.iloc[idx, -1]
        elif dataset_name == "bbh":
          question = data[idx]["input"]
          true_answer = data[idx]["target"]
        else:
          assert dataset_name == "gsm8k"
          question = data.iloc[idx, 0]
          true_answer = data.iloc[idx, 1]
        if include_qa:  # when "Q:" and "A:" are present in the prompt
          if instruction_pos == "before_Q":
            meta_prompt_exemplar_part += f"\ninput:\n<INS>\nQ: {question}\nA:"
          elif instruction_pos == "Q_begin":
            meta_prompt_exemplar_part += f"\ninput:\nQ: <INS>\n{question}\nA:"
          elif instruction_pos == "Q_end":
            meta_prompt_exemplar_part += f"\ninput:\nQ: {question}\n<INS>\nA:"
          else:  # instruction_pos == "A_begin"
            if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
              meta_prompt_exemplar_part += f"\nQ: {question}\nA: <Start>"
            else:
              assert optimizer_llm_name.lower() == "text-bison"
              meta_prompt_exemplar_part += f"\ninput:\nQ: {question}\nA: <INS>"
        else:  # when there're no "Q:" and "A:" in the prompt
          assert instruction_pos in {"Q_begin", "Q_end"}
          if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
            if instruction_pos == "Q_begin":
              meta_prompt_exemplar_part += f"\nProblem:\n<INS>\n{question}\n"
            elif instruction_pos == "Q_end":
              meta_prompt_exemplar_part += f"\nProblem:\n{question}\n<INS>\n"
          else:
            assert optimizer_llm_name.lower() == "text-bison"
            if instruction_pos == "Q_begin":
              meta_prompt_exemplar_part += f"\ninput:\n<INS>\n{question}\n"
            elif instruction_pos == "Q_end":
              meta_prompt_exemplar_part += f"\ninput:\n{question}\n<INS>\n"

        if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
          meta_prompt_exemplar_part += (
              f"\nGround truth answer:\n{true_answer}\n"
          )
        else:
          assert optimizer_llm_name.lower() == "text-bison"
          meta_prompt_exemplar_part += f"\noutput:\n{true_answer}\n"

    if few_shot_qa_pairs:
      if instructions_before_exemplars:
        meta_prompt += (
            meta_prompt_old_instruction_part
            + "\n\n"
            + meta_prompt_exemplar_part
        )
      else:
        meta_prompt += (
            meta_prompt_exemplar_part
            + "\n\n"
            + meta_prompt_old_instruction_part
        )
    else:
      meta_prompt += meta_prompt_old_instruction_part
    if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
      if instruction_pos == "A_begin":
        meta_prompt += (
            "\n\nGenerate a starting sentence that is different from all the"
            " <Start> sentences above, and has a higher score than all the"
            " <Start> sentences above. The starting sentence should begin with"
            " <Start> and end with </Start>. The starting sentence should be"
            " concise, effective, and generally applicable to all QA pairs"
            " above."
        )
      else:
        meta_prompt += (
            "\n\nGenerate an instruction that"
            " is different from all the instructions <INS> above,"
            " and has a higher score than all the instructions <INS> above."
            " The instruction should begin with <INS> and end with </INS>."
            " The instruction should be concise, effective,"
            " and generally applicable to all problems above."
        )
    else:
      assert optimizer_llm_name.lower() == "text-bison"
      meta_prompt += (
          "\n\nWrite your new text that is different from the old ones and"
          " has a score as high as possible. Write the text in square brackets."
      )
  else:
    # when using a pre-trained model as optimizer
    assert meta_prompt_type == "instructions_only"

    assert instruction_pos in {"Q_begin", "Q_end", "A_begin"}
    if instruction_pos == "Q_begin":
      instruction_pos_description = "at the beginning of the question"
    elif instruction_pos == "Q_end":
      instruction_pos_description = "at the end of the question"
    else:
      assert instruction_pos == "A_begin"
      instruction_pos_description = "at the beginning of the answer"

    if dataset_name == "gsm8k":
      instruction_task_description = "grade school math"
    elif dataset_name == "mmlu":
      instruction_task_description = task_name
    else:
      assert dataset_name == "bbh"
      instruction_task_description = " ".join(task_name.split("_"))

    meta_instruction = (
        f"Create a piece of text {instruction_pos_description.strip()} to"
        " enhance the precision in solving diverse"
        f" {instruction_task_description.strip()} problems."
    )
    old_instructions_and_scores = sorted(
        old_instructions_and_scores, key=lambda x: x[1]
    )
    old_instructions_and_scores_str = ""
    for instruction, score, _ in old_instructions_and_scores:
      if num_score_buckets == np.inf:
        score_to_show = round(score, 2)
      else:
        score_to_show = _bucketize_float(score, num_score_buckets)
      old_instructions_and_scores_str += (
          f"\n\nPrecision: {score_to_show} <TEXT>{instruction}</TEXT>"
      )
    meta_prompt += meta_instruction + old_instructions_and_scores_str
  return meta_prompt

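# Illustrative call of gen_meta_prompt (not part of the original module; the
# argument values below are made up for demonstration):
#   gen_meta_prompt(
#       old_instructions_and_scores=[("Let's think step by step.", 0.72, -1)],
#       instruction_pos="Q_begin",
#       optimizer_llm_name="gpt-3.5-turbo",
#       meta_prompt_type="both_instructions_and_exemplars",
#       few_shot_qa_pairs=False,
#       num_score_buckets=100,
#       dataset_name="gsm8k",
#       task_name="train",
#   )
# The returned meta-prompt lists the old instruction as a "text:/score:" pair
# (0.72 is shown as 72 because of the 100-way bucketing) and ends with the
# request to generate a new instruction wrapped in <INS> and </INS>.

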
def run_evolution(**kwargs):
  """The function for evolution."""
  # ================= experiment configurations =============================
  num_search_steps = kwargs["num_search_steps"]
  old_instruction_score_threshold = kwargs["old_instruction_score_threshold"]
  scorer_llm_dict = kwargs["scorer_llm_dict"]
  optimizer_llm_dict = kwargs["optimizer_llm_dict"]
  extract_final_answer_by_prompting_again = kwargs[
      "extract_final_answer_by_prompting_again"
  ]
  include_qa = kwargs["include_qa"]
  evaluate_in_parallel = kwargs["evaluate_in_parallel"]
  tasks_all = kwargs["tasks_all"]
  train_ratio = kwargs["train_ratio"]
  eval_ratio = kwargs["eval_ratio"]
  test_ratio = kwargs["test_ratio"]
  train_index = kwargs["train_index"]
  eval_index = kwargs["eval_index"]
  dataset_name = kwargs["dataset_name"]
  task_name = kwargs["task_name"]
  num_examples = kwargs["num_examples"]
  root_data_folder_path = kwargs["root_data_folder_path"]
  optimizer_llm_temperature = kwargs["optimizer_llm_temperature"]
  optimizer_llm_temperature_schedule = (
      kwargs["optimizer_llm_temperature_schedule"]
      if "optimizer_llm_temperature_schedule" in kwargs
      else "constant"
  )
  optimizer_llm_temperature_end = (
      kwargs["optimizer_llm_temperature_end"]
      if "optimizer_llm_temperature_end" in kwargs
      else None
  )
  initial_instructions = kwargs["initial_instructions"]
  multiple_choice_tasks = kwargs["multiple_choice_tasks"]
  raw_data = kwargs["raw_data"]
  call_scorer_server_func = kwargs["call_scorer_server_func"]
  call_optimizer_server_func = kwargs["call_optimizer_server_func"]
  instruction_pos = kwargs["instruction_pos"]
  prediction_treat_as_number = kwargs["prediction_treat_as_number"]
  prediction_treat_as_bool = kwargs["prediction_treat_as_bool"]
  result_by_instruction_folder = kwargs["result_by_instruction_folder"]
  few_shot_qa_pairs = kwargs["few_shot_qa_pairs"]
  num_score_buckets = kwargs["num_score_buckets"]
  max_num_instructions = kwargs["max_num_instructions"]
  meta_prompt_type = kwargs["meta_prompt_type"]
  meta_prompt_instructions_before_exemplars = kwargs[
      "meta_prompt_instructions_before_exemplars"
  ]
  few_shot_selection_criteria = kwargs["few_shot_selection_criteria"]
  optimizer_llm_name = kwargs["optimizer_llm_name"]
  num_generated_instructions_in_each_step = kwargs[
      "num_generated_instructions_in_each_step"
  ]
  evaluate_generated_ins_on_few_shot = kwargs[
      "evaluate_generated_ins_on_few_shot"
  ]
  num_few_shot_questions_for_instruction_refinement = kwargs[
      "num_few_shot_questions_for_instruction_refinement"
  ]
  evaluate_old_ins_on_few_shot = kwargs["evaluate_old_ins_on_few_shot"]
  eval_interval = kwargs["eval_interval"]
  save_folder = kwargs["save_folder"]
  verbose = kwargs["verbose"] if "verbose" in kwargs else False
  # =================== assertions =====================
  assert dataset_name in {
      "mmlu",
      "bbh",
      "gsm8k",
  }, "The lower-case dataset name must be one of mmlu, bbh, gsm8k."
  assert optimizer_llm_temperature_schedule in {
      "constant",
      "linear_increase",
  }, "The temperature schedule should be constant or linear_increase."

  # =================== save configurations to json file ====================
  configs_dict = dict()
  configs_dict["scorer_llm_dict"] = scorer_llm_dict
  configs_dict["optimizer_llm_dict"] = optimizer_llm_dict
  configs_dict["instruction_pos"] = instruction_pos
  configs_dict["optimizer_llm_temperature"] = optimizer_llm_temperature
  configs_dict["optimizer_llm_temperature_schedule"] = (
      optimizer_llm_temperature_schedule
  )
  configs_dict["optimizer_llm_temperature_end"] = optimizer_llm_temperature_end
  with open(os.path.join(save_folder, "configs_dict.json"), "w") as f:
    json.dump(configs_dict, f, indent=4)
  num_servers = scorer_llm_dict["num_servers"]
  batch_size = scorer_llm_dict["batch_size"]
  generated_ins_on_few_shot_results_dict = dict()
  old_ins_on_few_shot_results_dict = dict()
  # evaluation results every few steps
  # format: [(i_step, instruction, detailed_results_df)]
  eval_results = []
  # all generated instructions, format: [(instruction, score, step_index)]
  # the instructions that were skipped have score NaN
  old_instructions_and_scores_raw = []
  # the new instructions, format: [(instruction, score, step_index)]
  old_instructions_and_scores = []
  meta_prompts = []  # format: [(meta_prompt, step_index)]
  instruction_score_dict = dict()  # the dictionary of {instruction: score}
  # the dictionary of the few-shot QA indices in meta-prompt
  # key: step index; value: the list of few-shot indices in that step
  few_shot_index_list_by_step_dict = dict()
  detailed_results_df_by_instruction_dict = dict()
  wrong_questions_from_start_counter = collections.Counter()
  # EVAL results
  eval_detailed_results_df_dict = dict()  # {instruction: detailed_results_df}
  instruction_eval_score_dict = dict()  # {instruction: eval_score}
  old_instruction_md5_hashstrings_set = set()

  print(f"tasks_all: {tasks_all}")
  print(
      f"train_ratio: {train_ratio}, number of training points:"
      f" {int(num_examples * train_ratio)}"
  )
  print(
      f"eval_ratio: {eval_ratio}, number of eval points: "
      f"{int(num_examples * eval_ratio)}"
  )
  print(
      f"test_ratio: {test_ratio}, number of test points: "
      f"{int(num_examples * test_ratio)}"
  )
  print(
      f"optimizer llm temperature: {optimizer_llm_temperature}, schedule:"
      f" {optimizer_llm_temperature_schedule}"
  )
  print(
      f"generating {num_generated_instructions_in_each_step} instructions in"
      f" each step, run for {num_search_steps} steps"
  )
  print(
      "discarding generated instructions with score less than:"
      f" {old_instruction_score_threshold} (old_instruction_score_threshold)"
  )
  print(f"num_score_buckets: {num_score_buckets}")
  if dataset_name == "mmlu":
    is_multiple_choice = True
    is_multiple_choice_eval = True
  elif dataset_name in {"gsm8k"}:
    is_multiple_choice = False
    is_multiple_choice_eval = False
  else:
    assert dataset_name == "bbh"
    is_multiple_choice = []
    is_multiple_choice_eval = []
    train_index_by_task_dict = dict()
    eval_index_by_task_dict = dict()
    start_index = 0
    for task_name in tasks_all:
      single_task_list = eval_utils.load_bbh_task_data(
          task_name, base_dir=root_data_folder_path
      )
      end_index = start_index + len(single_task_list)
      train_index_by_task_dict[task_name] = (
          train_index[(train_index >= start_index) & (train_index < end_index)]
          # if " - start_index" is added here, then the dict would contain
          # indices in the original task
      )
      eval_index_by_task_dict[task_name] = (
          eval_index[(eval_index >= start_index) & (eval_index < end_index)]
          # if " - start_index" is added here, then the dict would contain
          # indices in the original task
      )
      start_index = end_index
      is_multiple_choice_single_task_train = [
          task_name in multiple_choice_tasks
      ] * len(train_index_by_task_dict[task_name])
      is_multiple_choice_single_task_eval = [
          task_name in multiple_choice_tasks
      ] * len(eval_index_by_task_dict[task_name])
      is_multiple_choice += is_multiple_choice_single_task_train
      is_multiple_choice_eval += is_multiple_choice_single_task_eval
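  # Illustrative example of the BBH index bookkeeping above (not part of the
  # original module): if tasks_all = ["task_a", "task_b"] with 250 examples
  # each, the concatenated data covers global indices [0, 500); train/eval
  # indices falling in [0, 250) are assigned to task_a and those in [250, 500)
  # to task_b, while keeping their global values (no "- start_index" shift).
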
  prev_saved_instructions = set()

  # evaluate initial instructions
  print("\n============== evaluating initial instructions ===============")
  for instruction in initial_instructions:
    print(f"""computing the score of "{instruction}" by prompting""")

    detailed_results_df = eval_utils.evaluate_single_instruction(
        data=raw_data,
        instruction=instruction,
        eval_index_all=train_index,
        batch_size=batch_size,
        call_server_func=call_scorer_server_func,
        dataset_name=dataset_name,
        num_servers=num_servers,
        extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again,
        include_qa=include_qa,
        evaluate_in_parallel=evaluate_in_parallel,
        instruction_pos=instruction_pos,
        is_multiple_choice=is_multiple_choice,
        prediction_treat_as_number=prediction_treat_as_number,
        prediction_treat_as_bool=prediction_treat_as_bool,
        prediction_num_decimals=0,
        max_retry=120,
        sleep_time=60,
        verbose=verbose,
    )

    detailed_results_df_by_instruction_dict[instruction] = detailed_results_df
    scores = detailed_results_df["accuracy"]
    average_score = np.average(scores)
    print(f"instruction: {instruction}, score: {average_score}")
    filename = eval_utils.instruction_to_filename(instruction)
    file_path = os.path.join(result_by_instruction_folder, f"{filename}.csv")
    detailed_results_df.to_csv(file_path, index=True, header=True)
    print(f"""saving results of "{instruction}" to {file_path}""")
    old_instructions_and_scores.append((instruction, average_score, -1))
    old_instructions_and_scores_raw.append((instruction, average_score, -1))
    instruction_score_dict[instruction] = average_score

    # increment the counter on wrong questions
    wrong_question_indices_set = set(
        list(
            detailed_results_df.iloc[
                np.where(detailed_results_df.accuracy == 0.0)[0], :
            ].index
        )
    )
    for idx in wrong_question_indices_set:
      wrong_questions_from_start_counter[idx] += 1
  # evolution
  for i_step in range(num_search_steps):
    print(f"\n================== Step {i_step} =====================")
    if not i_step % 10:
      print(f"old_instructions_and_scores: {old_instructions_and_scores}")

    if optimizer_llm_temperature_schedule == "linear_increase":
      optimizer_llm_temperature_curr = (
          optimizer_llm_temperature
          + i_step
          / num_search_steps
          * (optimizer_llm_temperature_end - optimizer_llm_temperature)
      )
    else:
      optimizer_llm_temperature_curr = optimizer_llm_temperature
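    # Illustrative arithmetic for the "linear_increase" schedule above (not
    # part of the original module): with optimizer_llm_temperature=1.0,
    # optimizer_llm_temperature_end=2.0 and num_search_steps=200, step 50 uses
    # 1.0 + 50 / 200 * (2.0 - 1.0) = 1.25, and the temperature approaches 2.0
    # as i_step approaches num_search_steps.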
    print(
        f"current optimizer_llm_temperature: {optimizer_llm_temperature_curr}"
    )

    # generate new instructions
    if few_shot_qa_pairs:
      if few_shot_selection_criteria == "accumulative_most_frequent":
        # select QA pairs that were done wrong the most number of times
        most_frequent_wrong_question_indices = [
            k
            for k, _ in sorted(
                wrong_questions_from_start_counter.items(), key=lambda x: -x[1]
            )
        ]
        print(
            "len(most_frequent_wrong_question_indices):"
            f" {len(most_frequent_wrong_question_indices)}"
        )
        if (
            len(most_frequent_wrong_question_indices)
            <= num_few_shot_questions_for_instruction_refinement
        ):
          few_shot_index_list = most_frequent_wrong_question_indices.copy()
        else:
          np.random.seed(i_step)
          few_shot_index_list = np.sort(
              np.random.choice(
                  most_frequent_wrong_question_indices,
                  num_few_shot_questions_for_instruction_refinement,
                  replace=False,
              )
          )

      elif few_shot_selection_criteria == "current_most_frequent":
        # show exemplars done wrong most often by currently shown instructions
        old_instruction_score_threshold_single_step = (
            old_instruction_score_threshold if i_step > 0 else 0
        )
        _, old_instructions_and_scores_in_meta_prompt = (
            gen_ins_and_score_pairs_substr(
                old_instructions_and_scores=old_instructions_and_scores,
                old_instruction_score_threshold=old_instruction_score_threshold_single_step,
                max_num_instructions=max_num_instructions,
                return_str_only=False,
                num_score_buckets=num_score_buckets,
            )
        )
        wrong_questions_counter_single_step = collections.Counter()
        for ins, _, _ in old_instructions_and_scores_in_meta_prompt:
          filename = eval_utils.instruction_to_filename(ins)
          file_path = os.path.join(
              result_by_instruction_folder, f"{filename}.csv"
          )
          single_ins_df = pd.read_csv(file_path, index_col=0, header=0)
          wrong_question_indices_set_single_old_ins = set(
              list(
                  single_ins_df.iloc[
                      np.where(single_ins_df.accuracy == 0.0)[0], :
                  ].index
              )
          )
          for idx in wrong_question_indices_set_single_old_ins:
            wrong_questions_counter_single_step[idx] += 1
        most_occurred_wrong_questions = [
            k
            for k, v in wrong_questions_counter_single_step.items()
            if v == max(wrong_questions_counter_single_step.values())
        ]
        if (
            len(most_occurred_wrong_questions)
            < num_few_shot_questions_for_instruction_refinement
        ):
          # pylint: disable=cell-var-from-loop
          idx_most_to_least = sorted(
              wrong_questions_counter_single_step,
              key=lambda x: -wrong_questions_counter_single_step[x],
          )
          few_shot_index_list = idx_most_to_least[
              :num_few_shot_questions_for_instruction_refinement
          ]
        else:
          few_shot_index_list = np.sort(
              np.random.choice(
                  most_occurred_wrong_questions,
                  num_few_shot_questions_for_instruction_refinement,
                  replace=False,
              )
          )
      elif few_shot_selection_criteria == "constant":
        np.random.seed(0)
        few_shot_index_list = np.sort(
            np.random.choice(
                train_index,
                num_few_shot_questions_for_instruction_refinement,
                replace=False,
            )
        )
      else:
        assert few_shot_selection_criteria == "random"
        np.random.seed(i_step)
        few_shot_index_list = np.sort(
            np.random.choice(
                train_index,
                num_few_shot_questions_for_instruction_refinement,
                replace=False,
            )
        ).tolist()
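      # Summary of the few_shot_selection_criteria branches above (descriptive
      # comment, not part of the original module):
      #   - "accumulative_most_frequent": sample from the questions answered
      #     wrong most often since the start of the run.
      #   - "current_most_frequent": sample from the questions answered wrong
      #     most often by the instructions currently shown in the meta-prompt.
      #   - "constant": a fixed random subset of the training set (seed 0).
      #   - "random": a fresh random subset of the training set re-drawn each
      #     step (seeded by i_step).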
      few_shot_index_list_by_step_dict[i_step] = few_shot_index_list

      meta_prompt = gen_meta_prompt(
          old_instructions_and_scores=old_instructions_and_scores,
          instruction_pos=instruction_pos,
          optimizer_llm_name=optimizer_llm_name,
          old_instruction_score_threshold=old_instruction_score_threshold,
          max_num_instructions=max_num_instructions,
          meta_prompt_type=meta_prompt_type,
          few_shot_qa_pairs=few_shot_qa_pairs,
          include_qa=include_qa,
          data=raw_data,
          few_shot_index_list=few_shot_index_list,
          instructions_before_exemplars=meta_prompt_instructions_before_exemplars,
          num_score_buckets=num_score_buckets,
          dataset_name=dataset_name,
          task_name=task_name,
      )

    else:  # no few-shot exemplars in meta-prompt
      few_shot_index_list = []
      meta_prompt = gen_meta_prompt(
          old_instructions_and_scores=old_instructions_and_scores,
          instruction_pos=instruction_pos,
          optimizer_llm_name=optimizer_llm_name,
          old_instruction_score_threshold=old_instruction_score_threshold,
          max_num_instructions=max_num_instructions,
          meta_prompt_type=meta_prompt_type,
          few_shot_qa_pairs=False,
          include_qa=include_qa,
          instructions_before_exemplars=meta_prompt_instructions_before_exemplars,
          num_score_buckets=num_score_buckets,
          dataset_name=dataset_name,
          task_name=task_name,
      )
    print(f"\nmeta_prompt: \n\n{meta_prompt}\n")
    meta_prompts.append((meta_prompt, i_step))
    remaining_num_instructions_to_generate = (
        num_generated_instructions_in_each_step
    )
    generated_instructions_raw = []
    while remaining_num_instructions_to_generate > 0:
      optimizer_llm_input_text = meta_prompt
      # generate instructions
      print(f"current temperature: {optimizer_llm_temperature_curr}")
      raw_outputs = call_optimizer_server_func(
          optimizer_llm_input_text,
          temperature=optimizer_llm_temperature_curr,
      )

      # Extract the generated instructions from the optimizer LLM output. Only
      # keep some samples if the desired number of remaining instructions
      # is smaller than the total number of decodes in this step.
      if meta_prompt_type == "both_instructions_and_exemplars":
        raw_outputs = raw_outputs[:remaining_num_instructions_to_generate]
        if optimizer_llm_name.lower() in {"gpt-3.5-turbo", "gpt-4", "local"}:
          if instruction_pos == "A_begin":
            start_string = "<Start>"
            end_string = "</Start>"
          else:
            start_string = "<INS>"
            end_string = "</INS>"
          for raw_output in raw_outputs:
            if start_string not in raw_output:
              start_index = 0
            else:
              start_index = raw_output.index(start_string) + len(start_string)
            if end_string not in raw_output:
              end_index = len(raw_output)
            else:
              end_index = raw_output.index(end_string)
            new_inst = raw_output[start_index:end_index].strip()
            generated_instructions_raw.append(new_inst)
        else:
          assert optimizer_llm_name.lower() == "text-bison"
          generated_instructions_raw += [
              extract_string_in_square_brackets(string)
              for string in raw_outputs
          ]

        remaining_num_instructions_to_generate -= optimizer_llm_dict[
            "batch_size"
        ]
      else:
        assert meta_prompt_type == "instructions_only"
        max_num_instructions_to_keep_in_each_output = 1
        for string in raw_outputs:
          generated_instructions_raw += parse_tag_content(string)[
              :max_num_instructions_to_keep_in_each_output
          ]
        remaining_num_instructions_to_generate -= (
            optimizer_llm_dict["batch_size"]
            * max_num_instructions_to_keep_in_each_output
        )
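    # Illustrative parsing example for the extraction logic above (not part of
    # the original module): for a GPT-style optimizer with
    # instruction_pos != "A_begin", a raw output of
    # "<INS>Break the problem into steps.</INS>" yields
    # "Break the problem into steps."; if the tags are missing, the whole
    # output (stripped) is kept.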
    generated_instructions_raw = list(
        map(eval_utils.polish_sentence, generated_instructions_raw)
    )
    print(f"\ninitially generated instructions: {generated_instructions_raw}\n")

    # do not evaluate old instructions again
    generated_instructions = []  # the new instructions generated in this step
    for ins in generated_instructions_raw:
      ins_md5_hashstring = eval_utils.instruction_to_filename(
          ins, md5_hashing=True
      )
      if ins_md5_hashstring not in old_instruction_md5_hashstrings_set:
        generated_instructions.append(ins)
        old_instruction_md5_hashstrings_set.add(ins_md5_hashstring)
      else:
        print(f"already evaluated '{ins}' previously")
    generated_instructions = list(set(generated_instructions))

    to_evaluate_instructions = []
    for instruction in generated_instructions:
      if len(instruction) > 500:
        print(f"Step {i_step}, instruction: {instruction}, too long, skipped")
        continue
      if dataset_name == "gsm8k" and any(
          char.isdigit() for char in instruction
      ):
        print(
            f"Step {i_step}, instruction: {instruction}, contains numbers,"
            " skipped"
        )
        continue
      if "INS" in instruction:
        print(
            f"Step {i_step}, instruction: {instruction}, contains 'INS',"
            " skipped"
        )
        continue
      to_evaluate_instructions.append(instruction)
    print(f"\nto-evaluate generated instructions: {to_evaluate_instructions}\n")
    # evaluate new instructions on the few-shot exemplars in meta-prompt
    if few_shot_qa_pairs and evaluate_generated_ins_on_few_shot:
      print("evaluating GENERATED instructions on few-shot exemplars")
      single_step_eval_on_few_shot = dict()
      for instruction in to_evaluate_instructions:
        if instruction not in prev_saved_instructions:
          print(
              f"evaluating Step {i_step}, instruction: {instruction} on"
              " few-shot exemplars"
          )
          detailed_results_df = eval_utils.evaluate_single_instruction(
              data=raw_data,
              instruction=instruction,
              eval_index_all=few_shot_index_list,
              batch_size=batch_size,
              call_server_func=call_scorer_server_func,
              dataset_name=dataset_name,
              num_servers=num_servers,
              extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again,
              include_qa=include_qa,
              evaluate_in_parallel=evaluate_in_parallel,
              instruction_pos=instruction_pos,
              is_multiple_choice=is_multiple_choice,
              prediction_treat_as_number=prediction_treat_as_number,
              prediction_treat_as_bool=prediction_treat_as_bool,
              prediction_num_decimals=0,
              max_retry=5,
              sleep_time=180,
              verbose=verbose,
          )
          single_step_eval_on_few_shot[instruction] = detailed_results_df

      print(
          f"Step {i_step}, single_step_eval_on_few_shot:"
          f" {single_step_eval_on_few_shot}\n"
      )
      generated_ins_on_few_shot_results_dict[i_step] = (
          single_step_eval_on_few_shot
      )

    # evaluate OLD instructions on the few-shot exemplars in meta-prompt
    if few_shot_qa_pairs and evaluate_old_ins_on_few_shot:
      print("evaluating OLD instructions on few-shot exemplars")
      single_step_eval_on_few_shot = dict()
      for instruction, _, _ in old_instructions_and_scores:
        print(
            f"evaluating Step {i_step}, instruction: {instruction} on few-shot"
            " exemplars"
        )
        detailed_results_df = eval_utils.evaluate_single_instruction(
            data=raw_data,
            instruction=instruction,
            eval_index_all=few_shot_index_list,
            batch_size=scorer_llm_dict["batch_size"],
            call_server_func=call_scorer_server_func,
            dataset_name=dataset_name,
            num_servers=scorer_llm_dict["num_servers"],
            extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again,
            include_qa=include_qa,
            evaluate_in_parallel=evaluate_in_parallel,
            instruction_pos=instruction_pos,
            is_multiple_choice=is_multiple_choice,
            prediction_treat_as_number=prediction_treat_as_number,
            prediction_treat_as_bool=prediction_treat_as_bool,
            prediction_num_decimals=0,
            max_retry=5,
            sleep_time=180,
            verbose=verbose,
        )
        single_step_eval_on_few_shot[instruction] = detailed_results_df

      print(
          f"Step {i_step}, single_step_eval_on_few_shot:"
          f" {single_step_eval_on_few_shot}\n"
      )
      old_ins_on_few_shot_results_dict[i_step] = single_step_eval_on_few_shot
    # evaluate newly generated instructions on the training set
    for instruction in to_evaluate_instructions:
      if instruction not in prev_saved_instructions:
        print(f"""computing the score of "{instruction}" by prompting""")
        detailed_results_df = eval_utils.evaluate_single_instruction(
            data=raw_data,
            instruction=instruction,
            eval_index_all=train_index,
            batch_size=batch_size,
            call_server_func=call_scorer_server_func,
            dataset_name=dataset_name,
            num_servers=num_servers,
            extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again,
            include_qa=include_qa,
            evaluate_in_parallel=evaluate_in_parallel,
            instruction_pos=instruction_pos,
            is_multiple_choice=is_multiple_choice,
            prediction_treat_as_number=prediction_treat_as_number,
            prediction_treat_as_bool=prediction_treat_as_bool,
            prediction_num_decimals=0,
            max_retry=5,
            sleep_time=180,
            verbose=verbose,
        )
        prev_saved_instructions.add(instruction)
      else:
        # do not re-evaluate instructions that had been evaluated previously
        detailed_results_df = pd.read_csv(
            os.path.join(result_by_instruction_folder, f"{instruction}.csv"),
            index_col=0,
            header=0,
        )
        print(f"""reading previously saved "{instruction}" information""")

      scores = detailed_results_df["accuracy"]
      average_score = np.average(scores)
      print(
          f"Step {i_step}, instruction: {instruction}, score: {average_score}"
      )

      # increment the counter on wrong questions
      wrong_question_indices_set = set(
          list(
              detailed_results_df[detailed_results_df["accuracy"] == 0.0].index
          )
      )
      for idx in wrong_question_indices_set:
        wrong_questions_from_start_counter[idx] += 1

      filename = eval_utils.instruction_to_filename(instruction)
      file_path = os.path.join(
          result_by_instruction_folder, f"""{filename}.csv"""
      )
      detailed_results_df.to_csv(file_path, index=True, header=True)
      print(f"saving results to {file_path}")

      detailed_results_df_by_instruction_dict[instruction] = detailed_results_df
      old_instructions_and_scores.append((instruction, average_score, i_step))
      instruction_score_dict[instruction] = average_score

    # record all generated instructions
    for instruction in generated_instructions_raw:
      if instruction in instruction_score_dict:
        average_score = instruction_score_dict[instruction]
      else:
        average_score = np.nan
      old_instructions_and_scores_raw.append(
          (instruction, average_score, i_step)
      )
    # =============================== eval ====================================
    # every eval_interval steps, evaluate the instructions that were generated
    # in the current step and were not skipped
    if not i_step % eval_interval:
      for instruction in generated_instructions_raw:
        # if the instruction wasn't skipped in any step
        if instruction in instruction_score_dict:
          if instruction not in instruction_eval_score_dict:
            detailed_results_df = eval_utils.evaluate_single_instruction(
                data=raw_data,
                instruction=instruction,
                eval_index_all=eval_index,
                batch_size=batch_size,
                call_server_func=call_scorer_server_func,
                dataset_name=dataset_name,
                num_servers=num_servers,
                extract_final_answer_by_prompting_again=extract_final_answer_by_prompting_again,
                include_qa=include_qa,
                evaluate_in_parallel=evaluate_in_parallel,
                instruction_pos=instruction_pos,
                is_multiple_choice=is_multiple_choice_eval,
                prediction_treat_as_number=prediction_treat_as_number,
                prediction_treat_as_bool=prediction_treat_as_bool,
                prediction_num_decimals=0,
                max_retry=5,
                sleep_time=180,
                verbose=verbose,
            )
            eval_score = np.average(detailed_results_df["accuracy"])
            eval_detailed_results_df_dict[instruction] = detailed_results_df
            instruction_eval_score_dict[instruction] = eval_score
          else:
            eval_score = instruction_eval_score_dict[instruction]
          print(
              f"EVAL: \nStep {i_step}, instruction: {instruction}, eval score:"
              f" {eval_score:.2f}"
          )
          eval_results.append((i_step, instruction, eval_score))
    # ===================== save up-to-date results ===========================
    results_dict = dict()
    results_dict["meta_prompts"] = meta_prompts
    results_dict["old_instructions_and_scores"] = list(
        old_instructions_and_scores
    )
    results_dict["old_instructions_and_scores_raw"] = list(
        old_instructions_and_scores_raw
    )
    results_dict["generated_ins_on_few_shot_results_dict"] = (
        generated_ins_on_few_shot_results_dict
    )
    results_dict["old_ins_on_few_shot_results_dict"] = (
        old_ins_on_few_shot_results_dict
    )
    results_dict["few_shot_index_list_by_step_dict"] = (
        few_shot_index_list_by_step_dict
    )
    results_dict["eval_results"] = eval_results
    results_dict["eval_detailed_results_df_dict"] = (
        eval_detailed_results_df_dict
    )
    with open(os.path.join(save_folder, "results_dict.pkl"), "wb") as fp:
      pickle.dump(results_dict, fp)
    print(f"\nsaved all results to\n{save_folder}")