in ratatool-sampling/src/main/scala/com/spotify/ratatool/samplers/package.scala [119:150]
/**
 * Samples a collection of `TableRow`s by delegating to `BigSampler.sample`.
 *
 * The supplied `schema` is first rendered to a JSON string; the list of schema fields is then
 * rebuilt lazily from that string on first use. The resulting field list is what gets handed to
 * `BigSamplerBigQuery.hashTableRow` and `BigSamplerBigQuery.buildKey`.
 * NOTE(review): this round trip presumably keeps the `TableSchema` instance itself out of any
 * closure shipped to workers -- confirm against the `BigSamplerBigQuery` signatures.
 *
 * @param coll               rows to sample
 * @param fraction           sampling fraction, forwarded unchanged to `BigSampler.sample`
 * @param schema             table schema whose fields drive hashing and key building
 * @param fields             forwarded unchanged to `BigSampler.sample` (presumably the names of
 *                           the fields to hash on -- confirm); defaults to an empty `Seq`
 * @param seed               optional seed, forwarded unchanged; defaults to `None`
 * @param hashAlgorithm      hash algorithm, forwarded unchanged; defaults to `FarmHash`
 * @param distribution       optional target sample distribution, forwarded unchanged;
 *                           defaults to `None`
 * @param distributionFields forwarded to `BigSampler.sample` and also used, together with the
 *                           schema fields, to build the key function; defaults to an empty `Seq`
 * @param precision          forwarded unchanged; defaults to `Approximate`
 * @param maxKeySize         forwarded unchanged; defaults to `1e6.toInt`
 * @param byteEncoding       forwarded unchanged; defaults to `RawEncoding`
 * @return the collection returned by `BigSampler.sample`
 */
def sampleTableRow(
  coll: SCollection[TableRow],
  fraction: Double,
  schema: TableSchema,
  fields: Seq[String] = Seq(),
  seed: Option[Int] = None,
  hashAlgorithm: HashAlgorithm = FarmHash,
  distribution: Option[SampleDistribution] = None,
  distributionFields: Seq[String] = Seq(),
  precision: Precision = Approximate,
  maxKeySize: Int = 1e6.toInt,
  byteEncoding: ByteEncoding = RawEncoding
): SCollection[TableRow] = {
  // Eagerly capture the schema as a plain JSON string.
  val schemaJson = JsonSerDe.toJsonString(schema)

  // Field list is re-parsed from the JSON string only when first accessed.
  @transient lazy val fieldsFromSchema = {
    val reparsedSchema = JsonSerDe.fromJsonString(schemaJson, classOf[TableSchema])
    reparsedSchema.getFields.asScala.toList
  }

  BigSampler.sample(
    coll,
    fraction,
    fields,
    seed,
    hashAlgorithm,
    distribution,
    distributionFields,
    precision,
    // Passed inline (not hoisted into vals) so argument typing and evaluation stay as they were.
    BigSamplerBigQuery.hashTableRow(fieldsFromSchema),
    BigSamplerBigQuery.buildKey(fieldsFromSchema, distributionFields),
    maxKeySize,
    byteEncoding
  )
}