def build_hdfs_files_list()

in twml/twml/trainers/data_record_trainer.py [0:0]


  def build_hdfs_files_list(
      files_list_path, data_dir,
      start_datetime, end_datetime, datetime_format,
      data_threads, hour_resolution, maybe_save, overwrite):
    """Resolve the list of data files, optionally backed by a JSON cache file.

    When ``files_list_path`` points to an existing cache file, its "files"
    entry is returned if the cached parameters match the current ones, or if
    ``data_dir``, ``start_datetime`` and ``end_datetime`` are all None.
    Otherwise the list is built with ``twml.util.list_files_by_datetime``.

    The cache file is only (re)written when the list was freshly built, so
    reusing a cached list never replaces the parameters stored alongside it.

    Args:
      files_list_path: path of the JSON cache file, or a falsy value to
        disable caching. Non-empty paths go through
        ``twml.util.preprocess_path``.
      data_dir: forwarded to ``list_files_by_datetime`` as ``base_path``.
      start_datetime: start of the range; ``datetime.datetime`` values are
        converted to strings with ``datetime_format``.
      end_datetime: end of the range; converted like ``start_datetime``.
      datetime_format: format used for the datetime conversion and forwarded
        as ``datetime_prefix_format``.
      data_threads: forwarded as ``parallelism``.
      hour_resolution: forwarded as ``hour_resolution``.
      maybe_save: when truthy (and ``files_list_path`` is set), a freshly
        built list is written to ``files_list_path`` with its parameters.
      overwrite: when truthy, a cache whose parameters do not match the
        current ones is replaced by a freshly built list instead of raising.

    Returns:
      The files list, either read from the cache or returned by
      ``twml.util.list_files_by_datetime``.

    Raises:
      ValueError: if the cache exists, its parameters do not match the
        current (non-None) ones, and ``overwrite`` is falsy.
    """
    if files_list_path:
      files_list_path = twml.util.preprocess_path(files_list_path)

    if isinstance(start_datetime, datetime.datetime):
      start_datetime = start_datetime.strftime(datetime_format)
    if isinstance(end_datetime, datetime.datetime):
      end_datetime = end_datetime.strftime(datetime_format)

    list_files_by_datetime_args = {
      "base_path": data_dir,
      "start_datetime": start_datetime,
      "end_datetime": end_datetime,
      "datetime_prefix_format": datetime_format,
      "extension": "lzo",
      "parallelism": data_threads,
      "hour_resolution": hour_resolution,
      "sort": True,
    }

    # Parameters stored next to the file list in the cache. The same dict is
    # used for the consistency check and for saving, so both stay in sync.
    current_params = {
      "data_dir": data_dir,
      "start_datetime": start_datetime,
      "end_datetime": end_datetime,
      "datetime_format": datetime_format,
      "hour_resolution": hour_resolution,
    }

    cache_exists = bool(files_list_path) and tf.io.gfile.exists(files_list_path)
    # Tracks whether files_list was built now (True) or loaded from the cache.
    list_is_fresh = True

    if not cache_exists:
      # no cache of data file paths, just get the list by scraping the directory
      # twml.util.list_files_by_datetime returns None if data_dir is None.
      # twml.util.list_files_by_datetime passes through data_dir if data_dir is a list
      files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args)
    else:
      # the cached data file paths file exists.
      files_info = twml.util.read_file(files_list_path, decode="json")
      # use the cached list if data params match current params,
      #  or if current params are None
      # Not including None checks for datetime_format and hour_resolution,
      #  since those are shared between eval and training.
      if (all(param is None for param in [data_dir, start_datetime, end_datetime]) or
          all(files_info[key] == value for key, value in current_params.items())):
        files_list = files_info["files"]
        list_is_fresh = False
      elif overwrite:
        # current params are not none and don't match saved params
        # `overwrite` indicates we should thus update the list
        files_list = twml.util.list_files_by_datetime(**list_files_by_datetime_args)
      else:
        # dont update the cached list
        raise ValueError("Information in files_list is inconsistent with provided args.\n"
                         "Did you intend to overwrite files_list using "
                         "--train.overwrite_files_list or --eval.overwrite_files_list?\n"
                         "If you instead want to use the paths in files_list, ensure that "
                         "data_dir, start_datetime, and end_datetime are None.")

    # Only persist a list that was built in this call. Writing back a list
    # that came from the cache would, when the current params are None,
    # replace the stored params with None and make the cache look
    # inconsistent to later calls that pass the real params.
    if maybe_save and files_list_path and list_is_fresh:
      save_dict = {"files": files_list}
      save_dict.update(current_params)
      twml.util.write_file(files_list_path, save_dict, encode="json")

    return files_list