def main()

in evaluate.py


import argparse
import logging
import os

# Module-level logger; project helpers such as parse_scenarios_ranges,
# validate_data_path and evaluate_scenario are defined elsewhere in
# evaluate.py.
logger = logging.getLogger(__name__)


def main():
    """Main function to evaluate the scenarios."""
    parser = argparse.ArgumentParser(
        description="Evaluate benchmark scenarios with LLM."
    )

    group = parser.add_argument_group("paths")

    group.add_argument(
        "--data-dir",
        type=str,
        required=True,
        help="Root directory of the evaluated dataset",
    )

    group.add_argument(
        "--report-path",
        type=str,
        required=False,
        help="Path to save the grading report",
        default=None,
    )

    group = parser.add_argument_group("scenarios")

    exclusive_group = group.add_mutually_exclusive_group(required=True)

    exclusive_group.add_argument(
        "--scenarios",
        type=int,
        help="Number of scenarios to evaluate",
    )

    exclusive_group.add_argument(
        "--scenario-ranges",
        type=str,
        help="Range(s) of scenarios to evaluate. Sample: 1,3,5-10",
    )
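
    # Illustrative invocations (paths are placeholders; the range syntax
    # follows the --scenario-ranges help text above):
    #   python evaluate.py --data-dir ./data --scenarios 10
    #   python evaluate.py --data-dir ./data --scenario-ranges 1,3,5-10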

    args = parser.parse_args()

    # Checking against None rather than truthiness: "--scenarios 0" would
    # otherwise fall through and pass None to parse_scenarios_ranges.
    if args.scenarios is not None:
        scenarios = list(range(1, args.scenarios + 1))
    else:
        scenarios = parse_scenarios_ranges(args.scenario_ranges)

    try:
        data_dir = validate_data_path(args.data_dir)
        if args.report_path:
            report_path = validate_report_path(args.report_path)
        else:
            report_path = os.path.join(data_dir, "grades.csv")

    except (ValueError, FileNotFoundError, FileExistsError) as e:
        logger.error("Error: %s", e)
        return 1

    # o1-mini produces the scenario answers; GPT-4 Omni grades them
    # (see the evaluate_scenario call below).
    gpt_4_omni = get_gpt4_model()
    o1_mini = get_o1_mini_model()

    grading_report = []

    for filename in os.listdir(data_dir):
        if is_valid_scenario(filename, data_dir, scenarios):
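            # Scenario directories are named by their numeric id.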
            scenario_id = filename
            accuracy, completeness = evaluate_scenario(
                base_path=data_dir,
                scenario_id=scenario_id,
                evaluation_model=o1_mini,
                grading_model=gpt_4_omni
            )

            grading_report.append(accuracy.to_data_frame("accuracy"))
            grading_report.append(completeness.to_data_frame("completeness"))

            scenarios.remove(int(scenario_id))

    save_grading_report(report_path, grading_report)

    if scenarios:
        logger.warning("Missed scenario(s): %s", scenarios)

    # Explicit success code, mirroring the "return 1" on the error path.
    return 0
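

parse_scenarios_ranges is not shown in this excerpt; it is defined elsewhere
in evaluate.py. A minimal sketch of the behaviour implied by the "1,3,5-10"
help text (the name matches the call above, but the body here is an
assumption, not the project's actual code):


def parse_scenarios_ranges(ranges: str) -> list[int]:
    """Expand a spec such as "1,3,5-10" into [1, 3, 5, 6, 7, 8, 9, 10]."""
    scenarios = []
    for part in ranges.split(","):
        if "-" in part:
            # "5-10" denotes an inclusive range of scenario ids.
            start, end = part.split("-", 1)
            scenarios.extend(range(int(start), int(end) + 1))
        else:
            scenarios.append(int(part))
    return scenarios


Since main() returns an exit code, evaluate.py presumably ends with the usual
entry-point guard (assumed, not shown in the excerpt):


if __name__ == "__main__":
    sys.exit(main())  # requires "import sys" at module level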