Utils/prepare_data.py (56 lines of code) (raw):
import os
import json
from pathlib import Path
from Utils.constants import repo_to_complexity
# Directory two levels above this file (the parent of the folder containing
# this module); main() builds its Config/, Scenarios/ and Dataset/ paths from it.
base_path = Path(__file__).resolve().parent.parent
def traverse_files_and_generate_questions(root_folder: str):
    """Render every readable text file under *root_folder* as a fenced listing.

    Each file contributes one entry of the form::

        <path relative to root_folder>
        ```<extension>
        <file content>
        ```

    Entries are joined with a newline. Files that cannot be decoded as UTF-8
    are reported on stdout and skipped.

    Args:
        root_folder: Directory to walk recursively (anything ``os.walk``
            accepts, e.g. a ``str`` or a ``pathlib.Path``).

    Returns:
        The concatenated listing, or an empty string when no file was read.
    """
    output = []
    for subdir, dirs, files in os.walk(root_folder):
        # os.walk yields entries in filesystem order, which varies between
        # machines. Sorting ``dirs`` in place steers the (top-down) descent and
        # sorting ``files`` fixes the order within a directory, so the same
        # tree always produces the same text.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(subdir, file)
            relative_path = os.path.relpath(file_path, root_folder)
            # A name without any dot (e.g. "Makefile") has no extension; only
            # use the text after the last dot when a dot is actually present.
            _, dot, suffix = file.rpartition('.')
            extension = suffix if dot else ''
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except UnicodeDecodeError:
                print(f"Can't read file {file_path}, it's not a text file")
                continue
            output.append(f'{relative_path}\n'
                          f'```{extension}\n{content}\n```\n')
            print(f'Appending {relative_path}')
    return "\n".join(output)
def use_template_and_write(template_file_path: str, output_file_path: str, replacement_string: str):
    """Fill a template's code placeholder and write the result to a file.

    Every occurrence of the literal marker ``<place code here>`` in the
    template is replaced with *replacement_string*. A template without the
    marker is copied through unchanged.

    Args:
        template_file_path: Path of the template to read.
        output_file_path: Path of the file to create or overwrite.
        replacement_string: Text substituted for the marker.
    """
    # Read with the same explicit encoding used for writing; relying on the
    # platform default would mis-decode (or fail on) non-ASCII templates on
    # systems whose locale encoding is not UTF-8.
    with open(template_file_path, 'r', encoding='utf-8') as input_file:
        content = input_file.read()
    modified_content = content.replace('<place code here>', replacement_string)
    with open(output_file_path, 'w', encoding="utf-8") as output_file:
        output_file.write(modified_content)
def modify_output_filename(output_filename: str, repo_name: str):
    """Insert the repository name and its complexity before the file extension.

    ``"task.md"`` with repo ``"foo"`` becomes ``"task_foo_<complexity>.md"``.
    A filename without any dot gets the suffix appended and no extension
    (``"task"`` -> ``"task_foo_<complexity>"``).

    Args:
        output_filename: Base filename (typically the template's filename).
        repo_name: Repository name; also the lookup key in
            ``repo_to_complexity``.

    Returns:
        The decorated filename. When *repo_name* has no entry in
        ``repo_to_complexity`` the complexity part is rendered as ``None``.
    """
    complexity = repo_to_complexity.get(repo_name)
    # rpartition never raises: without a dot it returns ('', '', filename),
    # whereas unpacking rsplit('.', 1) would fail with ValueError.
    name, dot, extension = output_filename.rpartition('.')
    if not dot:
        return f'{output_filename}_{repo_name}_{complexity}'
    return f'{name}_{repo_name}_{complexity}.{extension}'
def generate_questions(templates_path, template, output_path, repo_path=None, repo_name=None):
    """Compile one task file from a template, optionally embedding a repository.

    Args:
        templates_path: Directory that contains *template*.
        template: Filename of the template inside *templates_path*.
        output_path: Directory for the compiled file; created if missing.
        repo_path: Optional repository directory whose files are rendered and
            substituted into the template. When falsy, an empty string is
            substituted instead.
        repo_name: Optional repository name. When truthy, the output filename
            is derived via ``modify_output_filename``; otherwise the template's
            own filename is reused.
    """
    # Render the repository listing first (it prints progress as it goes).
    if repo_path:
        content_to_insert = traverse_files_and_generate_questions(repo_path)
    else:
        content_to_insert = ''

    template_path = os.path.join(templates_path, template)
    os.makedirs(output_path, exist_ok=True)

    if repo_name:
        output_filename = modify_output_filename(template, repo_name)
    else:
        output_filename = template
    output_file_path = os.path.join(output_path, output_filename)

    use_template_and_write(template_path, output_file_path, content_to_insert)
    print(f"Output was written to {output_file_path}")
def main(model, lang):
    """Compile every task described by the config of *model* and *lang*.

    Reads ``Config/<model>/<lang>.json`` under ``base_path``. The loaded JSON
    is iterated as ``{goal_type: {template: repos}}`` where ``repos`` is a
    (possibly empty) collection of repository names. For each template, one
    task is compiled per listed repository, or a single repository-less task
    when the collection is empty.

    Args:
        model: Model name; used as a path component for config, templates
            and output.
        lang: Language name; selects the config file and is used as a path
            component for templates, output and the dataset.
    """
    print(f"Starting question generation for {model}")
    config_path = base_path / "Config" / model / f"{lang}.json"
    print(config_path)
    with open(config_path, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    scenarios_root = base_path / "Scenarios"
    for goal_type, repos_mapper in config.items():
        templates_path = scenarios_root / "Task_Templates" / model / lang / goal_type
        output_path = scenarios_root / "Compiled_Tasks" / model / lang / goal_type
        for template, repos in repos_mapper.items():
            if repos:
                # One compiled task per repository listed for this template.
                for repo_name in repos:
                    repo_path = base_path / "Dataset" / lang / repo_name
                    print(repo_name)
                    generate_questions(templates_path, template, output_path, repo_path, repo_name)
            else:
                # No repositories to embed: compile the template on its own.
                generate_questions(templates_path, template, output_path)