in Utils/auto_eval.py [0:0]
def main(model_name, language="JS"):
    """Evaluate every scenario listed in the model's summary CSV and save the grades."""
    gpt_4_omni = get_model()
    grading_report = []
    base_path = results_path / "Output" / model_name / language
    report_path = base_path / "grading.csv"
    summary_path = base_path / "summary.csv"
    if not summary_path.exists():
        print(f"File {summary_path} does not exist.")
        return
    df = pd.read_csv(summary_path)
    for index, row in df.iterrows():
        experiment = row['Experiment']
        category = row['Category']
        dataset = row['Dataset'] if row['Dataset'] != 'none' else ''
        complexity = row['Complexity'] if row['Complexity'] != 'none' else ''
        size = row['Size'] if row['Size'] != 'none' else ''
        category_name = construct_category_name(category, dataset, complexity, size)
        # Locate the scenario's output directory under the experiment folder.
        for root, dirs, files in os.walk(base_path / experiment):
            if category_name in dirs:
                category_path = Path(root) / category_name
                category_criteria_path = criteria_path / language / experiment / f'{category_name}_criteria.yaml'
                if not category_criteria_path.exists():
                    print(f"File {category_criteria_path} does not exist.")
                    continue
                # Grade the generated report against the category's criteria.
                output = extract_content(category_path / f'{category_name}_report_1.md')
                (accuracy, completeness) = evaluate_scenario(
                    category_path, output, category_criteria_path, gpt_4_omni
                )
                accuracy_data = accuracy.to_data_frame("accuracy")
                grading_report.append(accuracy_data)
                completeness_data = completeness.to_data_frame("completeness")
                grading_report.append(completeness_data)
                # Add the normalized weighted scores (shifted down by one) to the summary.
                df.at[index, 'Accuracy'] = float(accuracy_data['weighted_score'] - 1)
                df.at[index, 'Completeness'] = float(completeness_data['weighted_score'] - 1)
    df.to_csv(summary_path, index=False)
    save_grading_report(report_path, grading_report)
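
# Example invocation (a minimal sketch, not part of the original excerpt). It assumes
# this module is run directly and that get_model, results_path, criteria_path, and the
# other helpers used above are defined or imported at module level. The model name and
# language values below are illustrative placeholders only.
if __name__ == "__main__":
    main("gpt-4-omni", language="JS")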