def to_batches()

in reader/dataset.py [0:0]


  def to_batches(self):
    """This allows the init to control reading settings.

    Refer to https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Scanner.html#pyarrow.dataset.Scanner.from_dataset.

    Perform `drop_remainder` behavior to afix batch size.
    This does not shift our data distribution bc of volume and file-level shuffling on every repeat.
    """
    batch_size = self._dataset_kwargs["batch_size"]
    # Loop forever; each pass re-creates the dataset, which reshuffles
    # the files before reading.
    while True:
      ds = self._create_dataset()
      for batch in ds.to_batches(**self._dataset_kwargs):
        # A batch shorter than `batch_size` is treated as the remainder of
        # this pass: drop it and restart with a freshly shuffled dataset.
        if batch.num_rows < batch_size:
          logging.info(f"Dropping remainder ({batch.num_rows}/{batch_size})")
          break
        yield batch
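
The method relies on self._create_dataset() and self._dataset_kwargs, which are defined elsewhere in the class. A minimal sketch of what that surrounding context might look like, assuming Parquet input files and a file-level shuffle inside _create_dataset (the class name, constructor signature, and _file_paths attribute are illustrative assumptions, not the repo's actual code):

  import logging
  import random

  import pyarrow.dataset as pads


  class ShuffledParquetReader:
    """Hypothetical host class for the to_batches() method above."""

    def __init__(self, file_paths, **dataset_kwargs):
      self._file_paths = list(file_paths)
      # Forwarded verbatim to Dataset.to_batches(); must include "batch_size".
      self._dataset_kwargs = dataset_kwargs

    def _create_dataset(self):
      # Reshuffle at the file level on every repeat, as the docstring assumes.
      random.shuffle(self._file_paths)
      return pads.dataset(self._file_paths, format="parquet")

Usage would then look like:

  reader = ShuffledParquetReader(["part-0.parquet", "part-1.parquet"], batch_size=1024)
  batches = reader.to_batches()
  first = next(batches)  # a pyarrow.RecordBatch with exactly 1024 rows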