in reader/dataset.py [0:0]
def to_batches(self):
"""This allows the init to control reading settings.
Refer to https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Scanner.html#pyarrow.dataset.Scanner.from_dataset.
Perform `drop_remainder` behavior to afix batch size.
This does not shift our data distribution bc of volume and file-level shuffling on every repeat.
"""
batch_size = self._dataset_kwargs["batch_size"]
while True:
ds = self._create_dataset()
for batch in ds.to_batches(**self._dataset_kwargs):
if batch.num_rows < batch_size:
logging.info(f"Dropping remainder ({batch.num_rows}/{batch_size})")
break
yield batch
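
A minimal, self-contained sketch of the same drop-remainder pattern against a toy in-memory pyarrow dataset. The helper name `fixed_size_batches` and the toy data are hypothetical and only mirror the method above; they are not part of reader/dataset.py.

import pyarrow as pa
import pyarrow.dataset as pa_ds

def fixed_size_batches(dataset, batch_size):
    # Mirrors to_batches() above, minus the infinite repeat loop.
    for batch in dataset.to_batches(batch_size=batch_size):
        if batch.num_rows < batch_size:  # drop the short trailing batch
            break
        yield batch

toy = pa_ds.dataset(pa.table({"x": list(range(10))}))
for batch in fixed_size_batches(toy, batch_size=4):
    print(batch.num_rows)  # expect two full batches of 4; the 2-row remainder is dropped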