in ratatool-sampling/src/main/scala/com/spotify/ratatool/samplers/BigSamplerBigQuery.scala [42:93]
/**
 * Feeds the value(s) of one field of a BigQuery `TableRow` into `hasher`.
 *
 * `fieldStr` is a path of field names separated by `BigSampler.fieldSep`. Its first segment is
 * looked up in `tblSchema`; for `RECORD` fields the remaining segments are resolved recursively
 * against the nested schema. Fields whose mode is `REPEATED` contribute every element in list
 * order. A `null` value is logged at debug level and leaves the hasher untouched.
 *
 * `BYTES` values are accepted as `Array[Byte]`, as `java.nio.ByteBuffer`, or as base64 text
 * (which is decoded first), so the same payload hashes identically in each representation.
 *
 * @param tblSchema schema fields of the row being hashed (by-name, evaluated on use)
 * @param r         row the value is read from
 * @param fieldStr  path of the field to use as sampling key
 * @param hasher    hasher that receives the value(s)
 * @return the hasher after the field's value(s) have been put into it
 * @throws NoSuchElementException        if a path segment is not present in the schema, or the
 *                                       path ends on a `RECORD` that has a value to hash
 * @throws UnsupportedOperationException if the field's type is not handled below
 * @throws IllegalArgumentException      if a textual `BYTES` value is not valid base64
 */
private[samplers] def hashTableRow(
  tblSchema: => Seq[TableFieldSchema]
)(r: TableRow, fieldStr: String, hasher: Hasher): Hasher = {
  val subfields = fieldStr.split(BigSampler.fieldSep)
  val field = tblSchema.find(_.getName == subfields.head).getOrElse {
    throw new NoSuchElementException(s"Can't find field `$fieldStr` in the schema $tblSchema")
  }
  val v = r.get(subfields.head)
  if (v == null) {
    log.debug(
      s"Field `${field.getName}` of type ${field.getType} and mode ${field.getMode}" +
        s" is null - won't account for hash"
    )
    hasher
  } else {
    // Treat scalar and repeated fields uniformly: a scalar is a one-element sequence.
    val vs = if (field.getMode == "REPEATED") {
      v.asInstanceOf[JList[AnyRef]].asScala
    } else {
      Seq(v)
    }
    field.getType match {
      case "BOOLEAN" =>
        vs.foldLeft(hasher)((acc, elem) => acc.putBoolean(elem.toString.toBoolean))
      case "INTEGER" =>
        vs.foldLeft(hasher)((acc, elem) => acc.putLong(elem.toString.toLong))
      case "FLOAT" =>
        // NOTE(review): the value is narrowed to a 32-bit Float before hashing; left as-is
        // because switching to putDouble would change the hashes produced so far.
        vs.foldLeft(hasher)((acc, elem) => acc.putFloat(elem.toString.toFloat))
      case "STRING" | "TIMESTAMP" | "DATE" | "TIME" | "DATETIME" =>
        // All of these are hashed through their string representation.
        vs.foldLeft(hasher)((acc, elem) => acc.putString(elem.toString, BigSampler.utf8Charset))
      case "BYTES" =>
        vs.foldLeft(hasher) { (acc, elem) =>
          val bytes: Array[Byte] = elem match {
            case raw: Array[Byte] => raw
            case buf: java.nio.ByteBuffer =>
              // Copy through a duplicate so the caller's buffer position is not moved.
              val copy = new Array[Byte](buf.remaining())
              buf.duplicate().get(copy)
              copy
            case encoded =>
              // Textual BYTES are base64; decode so the hash matches the raw-bytes form.
              java.util.Base64.getDecoder.decode(encoded.toString)
          }
          acc.putBytes(bytes)
        }
      case "RECORD" =>
        val nestedPath = subfields.tail.mkString(BigSampler.fieldSep.toString)
        vs.foldLeft(hasher) { (acc, elem) =>
          // Checked per element so that an empty repeated record still hashes to nothing.
          if (nestedPath.isEmpty) {
            throw new NoSuchElementException(
              s"Field `${field.getName}` is a RECORD - a nested field must be given as" +
                s" sampling key (got `$fieldStr`)"
            )
          }
          hashTableRow(field.getFields.asScala.toList)(
            TableRow(elem.asInstanceOf[java.util.Map[String, Any]].asScala.toList: _*),
            nestedPath,
            acc
          )
        }
      case t =>
        throw new UnsupportedOperationException(
          s"Type `$t` of field `${field.getName}` is not supported as sampling key"
        )
    }
  }
}