in evaluate.py [0:0]
def main():
    """Main function to evaluate the scenarios."""
    parser = argparse.ArgumentParser(
        description="Evaluate benchmark scenarios with LLM."
    )
    group = parser.add_argument_group("paths")
    group.add_argument(
        "--data-dir",
        type=str,
        required=True,
        help="Root directory of the evaluated dataset",
    )
    group.add_argument(
        "--report-path",
        type=str,
        required=False,
        help="Path to save the grading report",
        default=None,
    )
    group = parser.add_argument_group("scenarios")
    exclusive_group = group.add_mutually_exclusive_group(required=True)
    exclusive_group.add_argument(
        "--scenarios",
        type=int,
        help="Number of scenarios to evaluate",
    )
    exclusive_group.add_argument(
        "--scenario-ranges",
        type=str,
        help="Range(s) of scenarios to evaluate. Sample: 1,3,5-10",
    )
    args = parser.parse_args()
    # Build the list of scenario IDs to evaluate: either the first N scenarios,
    # or an explicit range specification such as "1,3,5-10".
    if args.scenarios is not None:
        scenarios = list(range(1, args.scenarios + 1))
    else:
        scenarios = parse_scenarios_ranges(args.scenario_ranges)
    try:
        data_dir = validate_data_path(args.data_dir)
        if args.report_path:
            report_path = validate_report_path(args.report_path)
        else:
            report_path = os.path.join(data_dir, "grades.csv")
    except (ValueError, FileNotFoundError, FileExistsError) as e:
        logger.error("Error: %s", e)
        return 1
    gpt_4_omni = get_gpt4_model()
    o1_mini = get_o1_mini_model()

    # Grade every requested scenario found under the data directory and
    # collect the accuracy/completeness results into a single report.
    grading_report = []
    for filename in os.listdir(data_dir):
        if is_valid_scenario(filename, data_dir, scenarios):
            scenario_id = filename
            accuracy, completeness = evaluate_scenario(
                base_path=data_dir,
                scenario_id=scenario_id,
                evaluation_model=o1_mini,
                grading_model=gpt_4_omni,
            )
            grading_report.append(accuracy.to_data_frame("accuracy"))
            grading_report.append(completeness.to_data_frame("completeness"))
            scenarios.remove(int(scenario_id))
    save_grading_report(report_path, grading_report)

    # Any scenario IDs left over were requested but not found on disk.
    if len(scenarios) > 0:
        logger.warning("Missed scenario(s): %s", scenarios)
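The --scenario-ranges flag accepts a comma-separated list of IDs and inclusive ranges (e.g. 1,3,5-10). The parse_scenarios_ranges helper referenced above is defined elsewhere in evaluate.py and is not shown here; the following is only a minimal sketch of what a parser for that syntax might look like, under the assumption that ranges are inclusive on both ends.

# Hypothetical illustration only -- the real parse_scenarios_ranges may differ.
def parse_scenarios_ranges(ranges: str) -> list[int]:
    """Expand a spec such as "1,3,5-10" into [1, 3, 5, 6, 7, 8, 9, 10]."""
    scenarios: list[int] = []
    for part in ranges.split(","):
        if "-" in part:
            start, end = part.split("-", 1)
            scenarios.extend(range(int(start), int(end) + 1))
        else:
            scenarios.append(int(part))
    return scenarios

With either selection flag, a typical invocation would look like
python evaluate.py --data-dir ./dataset --scenario-ranges 1,3,5-10
where the data directory path is a placeholder.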