private[samplers] def sample()

in ratatool-sampling/src/main/scala/com/spotify/ratatool/samplers/BigSamplerParquet.scala [33:77]


  /**
   * Samples the Parquet data at `input` and writes the result to `output`, unless part files
   * under `output` are already reported as done, in which case those files are reused instead
   * of sampling again.
   *
   * The Avro schema is read from `input` up front and is used for the record coder as well as
   * the Parquet read and write. When a new sample is produced, this method runs the pipeline on
   * `sc` and blocks until it is done before returning.
   *
   * @param sc Scio context used to read the input and to run the pipeline
   * @param input location of the Parquet data to sample; also where the Avro schema is read from
   * @param output location the sampled Parquet files are written to (with or without a trailing
   *   `/`)
   * @param fields forwarded unchanged to `sampleAvro`
   * @param fraction forwarded unchanged to `sampleAvro`
   * @param seed forwarded unchanged to `sampleAvro`
   * @param hashAlgorithm forwarded unchanged to `sampleAvro`
   * @param distribution forwarded unchanged to `sampleAvro`
   * @param distributionFields forwarded unchanged to `sampleAvro`
   * @param precision forwarded unchanged to `sampleAvro`
   * @param maxKeySize forwarded unchanged to `sampleAvro`
   * @param byteEncoding forwarded unchanged to `sampleAvro`; defaults to `RawEncoding`
   * @return a tap over the existing part files when they are reused, otherwise the tap returned
   *   by the Parquet write
   */
  private[samplers] def sample(
    sc: ScioContext,
    input: String,
    output: String,
    fields: Seq[String],
    fraction: Double,
    seed: Option[Int],
    hashAlgorithm: HashAlgorithm,
    distribution: Option[SampleDistribution],
    distributionFields: Seq[String],
    precision: Precision,
    maxKeySize: Int,
    byteEncoding: ByteEncoding = RawEncoding
  ): ClosedTap[GenericRecord] = {
    val schema = ParquetIO.getAvroSchemaFromFile(input)

    // Glob over the part files under `output`, inserting a separator only when one is missing.
    val separator = if (output.endsWith("/")) "" else "/"
    val outputParts = s"$output${separator}part*"

    // Both the reuse path and the sampling path need a coder for the records.
    implicit val coder: Coder[GenericRecord] = avroGenericRecordCoder(schema)

    // Wraps the part files that are already present instead of sampling again.
    def reusePreviousSample(): ClosedTap[GenericRecord] = {
      log.info(s"Reuse previous sample at $outputParts")
      ClosedTap(MaterializeTap[GenericRecord](outputParts, sc))
    }

    // Reads the input, samples it, writes the Parquet output and waits for the pipeline.
    def sampleAndWrite(): ClosedTap[GenericRecord] = {
      log.info(s"Will sample from: $input, output will be $output")

      val records = sc.parquetAvroFile[GenericRecord](input, schema).map(identity)
      val sampled = sampleAvro[GenericRecord](
        records,
        fraction,
        schema,
        fields,
        seed,
        hashAlgorithm,
        distribution,
        distributionFields,
        precision,
        maxKeySize,
        byteEncoding
      )

      // The write must be registered before the pipeline is run.
      val writtenTap = sampled.saveAsParquetAvroFile(output, schema = schema)
      sc.run().waitUntilDone()
      writtenTap
    }

    if (FileStorage(outputParts).isDone) reusePreviousSample() else sampleAndWrite()
  }