override def sample()

in ratatool-sampling/src/main/scala/com/spotify/ratatool/samplers/AvroSampler.scala [110:146]


  /**
   * Reads up to `n` records from the Avro data opened via `FileSystems.open(r)`.
   *
   * When `head` is true the first `n` records are returned in file order. Otherwise the
   * whole stream is consumed and a reservoir of `n` records is maintained, each later
   * record replacing a reservoir slot chosen through `nextLong`.
   *
   * The underlying stream is always closed before this method returns, whether it
   * completes normally or throws.
   *
   * @param n    maximum number of records to return; must be > 0
   * @param head true to take the leading records, false to reservoir-sample the stream
   * @return at most `n` records (fewer if the stream holds fewer than `n`)
   * @throws IllegalArgumentException if `n` is not positive
   */
  override def sample(n: Long, head: Boolean): Seq[GenericRecord] = {
    require(n > 0, "n must be > 0")

    val input = Channels.newInputStream(FileSystems.open(r))
    try {
      val fileStream =
        new DataFileStream[GenericRecord](input, new GenericDatumReader[GenericRecord]())
      try {
        val schema = fileStream.getSchema
        logger.debug("Avro schema {}", schema)

        // Both modes start by taking the first n records (or all of them if the
        // stream is shorter than n).
        val result = ArrayBuffer.empty[GenericRecord]
        while (result.size < n && fileStream.hasNext) {
          result.append(fileStream.next())
        }

        if (!head) {
          // Reservoir sampling: `index` is the zero-based position of the record being
          // considered; it replaces a reservoir slot when the drawn position is < n.
          // NOTE(review): assumes nextLong(bound) draws uniformly from [0, bound) —
          // confirm against its definition.
          var index = n
          while (fileStream.hasNext) {
            val next = fileStream.next()
            val loc = nextLong(index + 1)
            if (loc < n) {
              result(loc.toInt) = next
            }
            index += 1
          }
        }

        result.toList
      } finally {
        fileStream.close()
      }
    } finally {
      // Also covers the case where constructing the DataFileStream itself fails;
      // closing an already-closed channel stream is a no-op.
      input.close()
    }
  }