Utils/auto_eval.py

"""Grade generated scenario reports with an LLM judge.

For each scenario listed in a model's summary.csv, extract the model's answer
from its report file, evaluate it against the scenario's YAML criteria, and
write the per-criterion grades to grading.csv plus normalized Accuracy and
Completeness scores back into summary.csv.
"""

import os
import re
import sys
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI

load_dotenv()
# The evaluator package lives outside this repo; make it importable.
sys.path.append(os.getenv('AUTO_LLM_EVAL_PATH'))

from evaluator import evaluate_scenario

results_path = Path(os.getenv('RESULTS_REPO_PATH')).resolve()
criteria_path = Path(__file__).resolve().parent.parent / 'Scenarios' / 'Criteria'


def get_model():
    """Get the evaluator model."""
    # Specify configuration for the AI Dial endpoint
    openai_endpoint = "https://ai-proxy.lab.epam.com"
    openai_deployment_name = "gpt-4o-2024-05-13"
    openai_api_version = "2024-05-01-preview"
    # Read the API key from the environment variables;
    # hard-coding the key in source is not secure.
    openai_api_key = os.environ["API_KEY"]
    # Define the GPT-4-omni judge model
    model = AzureChatOpenAI(
        temperature=0,  # request deterministic behavior
        azure_endpoint=openai_endpoint,
        azure_deployment=openai_deployment_name,
        api_version=openai_api_version,
        api_key=openai_api_key,
    )
    return model


def save_grading_report(report_path: Path, report):
    """Save the grading report to a CSV file."""
    df = pd.DataFrame(report)
    df.to_csv(report_path, index=False)


def construct_category_name(category, dataset, complexity, size):
    """Join the non-empty scenario attributes with underscores,
    e.g. category 'gen' with complexity 'high' becomes 'gen_high'."""
    parts = [category]
    if dataset:
        parts.append(dataset)
    if complexity:
        parts.append(complexity)
    if size:
        parts.append(size)
    return "_".join(parts)


def extract_content(file_path):
    """Return the text between the '### Answer:' and '### Tokens:' markers
    of a report file, or None if the markers are missing."""
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # Regular expression to find content between "### Answer:\n" and "\n### Tokens:"
    pattern = re.compile(r'### Answer:\n(.*?)\n### Tokens:', re.DOTALL)
    match = pattern.search(content)
    if match:
        return match.group(1).strip()
    return None


def main(model_name, language="JS"):
    """Evaluate every scenario listed in the model's summary.csv."""
    gpt_4_omni = get_model()
    grading_report = []
    base_path = results_path / "Output" / model_name / language
    report_path = base_path / "grading.csv"
    summary_path = base_path / "summary.csv"
    if not summary_path.exists():
        print(f"File {summary_path} does not exist.")
        return
    df = pd.read_csv(summary_path)
    for index, row in df.iterrows():
        experiment = row['Experiment']
        category = row['Category']
        # 'none' in the summary means the attribute does not apply.
        dataset = row['Dataset'] if row['Dataset'] != 'none' else ''
        complexity = row['Complexity'] if row['Complexity'] != 'none' else ''
        size = row['Size'] if row['Size'] != 'none' else ''
        category_name = construct_category_name(category, dataset, complexity, size)
        for root, dirs, _files in os.walk(base_path / experiment):
            if category_name in dirs:
                category_path = Path(root) / category_name
                category_criteria_path = (
                    criteria_path / language / experiment / f'{category_name}_criteria.yaml'
                )
                if not category_criteria_path.exists():
                    print(f"File {category_criteria_path} does not exist.")
                    continue
                output = extract_content(category_path / f'{category_name}_report_1.md')
                accuracy, completeness = evaluate_scenario(
                    category_path, output, category_criteria_path, gpt_4_omni
                )
                accuracy_data = accuracy.to_data_frame("accuracy")
                grading_report.append(accuracy_data)
                completeness_data = completeness.to_data_frame("completeness")
                grading_report.append(completeness_data)
                # add the normalized results to the summary
                df.at[index, 'Accuracy'] = float(accuracy_data['weighted_score'] - 1)
                df.at[index, 'Completeness'] = float(completeness_data['weighted_score'] - 1)
    df.to_csv(summary_path, index=False)
    save_grading_report(report_path, grading_report)


if __name__ == "__main__":
    main('ChatGPT4o_august_0509', 'JS')
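
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the pipeline). The script reads
# AUTO_LLM_EVAL_PATH, RESULTS_REPO_PATH, and API_KEY from the environment
# (or a .env file), and extract_content() expects each report file to wrap
# the graded answer in the two markers shown below. The answer text here is
# a hypothetical placeholder; only the marker lines matter to the regex.
#
#   ### Answer:
#   <generated code or answer to be graded>
#   ### Tokens:
#
# To grade a different run, call main() with the model's output folder name
# and language, e.g. main('ChatGPT4o_august_0509', 'JS'), or change the call
# in the __main__ block above.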