in ratatool-sampling/src/main/scala/com/spotify/ratatool/samplers/AvroSampler.scala [110:146]
/**
 * Reads up to `n` records from the Avro container stream opened from `r`.
 *
 * When `head` is true the first `n` records are returned in file order. Otherwise the
 * whole stream is scanned and a reservoir of `n` records is maintained: the first `n`
 * records seed the reservoir, and every later record may overwrite a slot chosen via
 * `nextLong`. If the stream holds fewer than `n` records, all of them are returned.
 *
 * The underlying input stream is always closed before this method returns, including
 * when reading fails part-way through.
 *
 * @param n    maximum number of records to return; must be > 0
 * @param head true to take the first `n` records, false to sample across the stream
 * @return the selected records as an immutable list
 * @throws IllegalArgumentException if `n` is not positive
 */
override def sample(n: Long, head: Boolean): Seq[GenericRecord] = {
  require(n > 0, "n must be > 0")
  val input = Channels.newInputStream(FileSystems.open(r))
  try {
    val datumReader = new GenericDatumReader[GenericRecord]()
    val fileStream = new DataFileStream[GenericRecord](input, datumReader)
    val schema = fileStream.getSchema
    logger.debug("Avro schema {}", schema)

    val result = ArrayBuffer.empty[GenericRecord]
    // Both modes start by taking the first n records (or fewer if the stream is short).
    while (result.size < n && fileStream.hasNext) {
      result.append(fileStream.next())
    }

    if (!head) {
      // Reservoir replacement phase: `index` counts the records consumed so far.
      // NOTE(review): assumes nextLong(bound) is uniform over [0, bound) - confirm
      // against its definition, which is outside this block.
      var index = n
      while (fileStream.hasNext) {
        val next = fileStream.next()
        val loc = nextLong(index + 1)
        if (loc < n) {
          // loc < n <= result.size here, so the narrowing to Int stays within the buffer.
          result(loc.toInt) = next
        }
        index += 1
      }
    }
    result.toList
  } finally {
    // Closing the stream obtained from Channels.newInputStream also closes the channel.
    input.close()
  }
}