in twml/twml/trainers/data_record_trainer.py [0:0]
def build_hdfs_files_list(
        files_list_path, data_dir,
        start_datetime, end_datetime, datetime_format,
        data_threads, hour_resolution, maybe_save, overwrite):
    """Return the list of data files, using a JSON cache file when possible.

    The list is either scraped from ``data_dir`` with
    ``twml.util.list_files_by_datetime`` (restricted to the ``lzo`` extension
    and sorted) or read from the JSON cache stored at ``files_list_path``.

    Args:
        files_list_path: Path of the JSON cache file. When falsy, no cache is
            read or written.
        data_dir: Base path handed to ``twml.util.list_files_by_datetime``.
        start_datetime: Start of the range, as a string or a
            ``datetime.datetime`` (formatted with ``datetime_format``).
        end_datetime: End of the range, as a string or a
            ``datetime.datetime`` (formatted with ``datetime_format``).
        datetime_format: Format used to stringify datetimes; also passed to
            the lister as ``datetime_prefix_format``.
        data_threads: Passed to the lister as ``parallelism``.
        hour_resolution: Passed through to the lister and stored in the cache.
        maybe_save: Whether this call is allowed to write the cache file.
        overwrite: Whether to rebuild the list (and the cache) when the cached
            parameters do not match the ones given here.

    Returns:
        The list of files: the value returned by
        ``twml.util.list_files_by_datetime`` or the cached ``"files"`` entry.

    Raises:
        ValueError: If the cache exists, its parameters differ from the
            provided ones, and ``overwrite`` is false.
    """
    if files_list_path:
        files_list_path = twml.util.preprocess_path(files_list_path)

    # Datetimes are turned into strings; these same string values are passed
    # to the lister, compared against the cache and stored in the cache.
    if isinstance(start_datetime, datetime.datetime):
        start_datetime = start_datetime.strftime(datetime_format)
    if isinstance(end_datetime, datetime.datetime):
        end_datetime = end_datetime.strftime(datetime_format)

    list_files_by_datetime_args = {
        "base_path": data_dir,
        "start_datetime": start_datetime,
        "end_datetime": end_datetime,
        "datetime_prefix_format": datetime_format,
        "extension": "lzo",
        "parallelism": data_threads,
        "hour_resolution": hour_resolution,
        "sort": True,
    }

    # Becomes False when the list is taken from the cache file, in which case
    # there is nothing new to persist.
    freshly_listed = True

    # no cache of data file paths, just get the list by scraping the directory
    if not files_list_path or not tf.io.gfile.exists(files_list_path):
        # twml.util.list_files_by_datetime returns None if data_dir is None.
        # twml.util.list_files_by_datetime passes through data_dir if data_dir is a list
        files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args)
    else:
        # the cached data file paths file exists.
        files_info = twml.util.read_file(files_list_path, decode="json")
        # use the cached list if data params match current params,
        # or if current params are None
        # Not including None checks for datetime_format and hour_resolution,
        # since those are shared between eval and training.
        if (all(param is None for param in [data_dir, start_datetime, end_datetime]) or
                (files_info["data_dir"] == data_dir and
                 files_info["start_datetime"] == start_datetime and
                 files_info["end_datetime"] == end_datetime and
                 files_info["datetime_format"] == datetime_format and
                 files_info["hour_resolution"] == hour_resolution)):
            files_list = files_info["files"]
            freshly_listed = False
        elif overwrite:
            # current params are not none and don't match saved params
            # `overwrite` indicates we should thus update the list
            files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args)
        else:
            # dont update the cached list
            raise ValueError("Information in files_list is inconsistent with provided args.\n"
                             "Did you intend to overwrite files_list using "
                             "--train.overwrite_files_list or --eval.overwrite_files_list?\n"
                             "If you instead want to use the paths in files_list, ensure that "
                             "data_dir, start_datetime, and end_datetime are None.")

    # Persist only lists produced in this call. A fresh list implies either the
    # cache file was absent or the `overwrite` branch ran, so this matches the
    # previous `overwrite or not exists` condition for those cases. A list that
    # was read from the cache is never written back: doing so with
    # overwrite=True and all-None params would replace the stored
    # data_dir/start/end with None and invalidate the cache for later calls.
    if maybe_save and files_list_path and freshly_listed:
        save_dict = {
            "files": files_list,
            "data_dir": data_dir,
            "start_datetime": start_datetime,
            "end_datetime": end_datetime,
            "datetime_format": datetime_format,
            "hour_resolution": hour_resolution,
        }
        twml.util.write_file(files_list_path, save_dict, encode="json")

    return files_list