in core/src/main/scala/com/spotify/featran/transformers/mdl/MDLPDiscretizer.scala [54:95]
/**
 * Arithmetic mean of two feature values; `discretize` uses it as the candidate cut point
 * placed between two neighbouring distinct values.
 *
 * The sum is formed in double precision so that two large finite floats cannot overflow to
 * infinity before being halved. For inputs whose single-precision sum neither overflows nor
 * underflows, the result is the same as `(x1 + x2) / 2.0f`.
 *
 * @param x1 first value
 * @param x2 second value
 * @return the mean of `x1` and `x2`, rounded to `Float`
 */
private def midpoint(x1: Float, x2: Float): Float =
  ((x1.toDouble + x2.toDouble) / 2.0).toFloat
/**
 * Computes discretization thresholds for the feature values held in `data`.
 *
 * Three steps:
 *  1. tally, for every distinct value (narrowed to `Float`), how often each label occurs;
 *  2. walk the distinct values in ascending order and emit a candidate cut point, carrying
 *     the label counts accumulated since the previous one, wherever `isBoundary` reports a
 *     boundary between two neighbouring values;
 *  3. pass the candidates, sorted by cut point, to a `ThresholdFinder`.
 *
 * @param maxBins forwarded unchanged to the `ThresholdFinder` constructor
 * @return the thresholds returned by the finder, widened to `Double`
 */
def discretize(maxBins: Int = MDLPDiscretizer.DefaultMaxBins): Seq[Double] = {
  // Distinct values (TreeMap keeps keys in ascending order) -> per-label occurrence counts.
  val countsByValue = new java.util.TreeMap[Float, Array[Long]]()
  data.foreach { case (label, value) =>
    val key = value.toFloat
    val labelIdx = labels(label)
    // java.util.Map#get yields null for an absent key; Option(...) turns that into None.
    Option(countsByValue.get(key)) match {
      case Some(counts) =>
        counts(labelIdx) += 1L
      case None =>
        val fresh = Array.fill(labels.size)(0L)
        fresh(labelIdx) = 1L
        countsByValue.put(key, fresh)
    }
  }

  val entries = countsByValue.asScala.iterator
  val candidates =
    if (entries.hasNext) {
      val (firstValue, firstCounts) = entries.next()
      var prevValue = firstValue
      var prevCounts = firstCounts
      // The running bucket starts out as the very same array as the first entry's counts
      // (no copy), so accumulating into it also changes that map entry; the entry is not
      // read again afterwards.
      var bucket = firstCounts
      var collected = List.empty[(Float, Array[Long])]
      entries.foreach { case (value, counts) =>
        if (isBoundary(counts, prevCounts)) {
          // Close the current bucket at the midpoint and start an empty one.
          collected = (midpoint(value, prevValue), bucket) :: collected
          bucket = Array.fill(labels.size)(0L)
        }
        prevValue = value
        prevCounts = counts
        // Return value is unused: plusI presumably adds `counts` into `bucket` in place
        // (TODO confirm against MDLUtil).
        MDLUtil.plusI(bucket, counts)
      }
      // The last bucket is keyed by the largest value itself rather than by a midpoint.
      (prevValue, bucket) :: collected
    } else {
      Nil
    }

  val minBinWeight: Long = (minBinPercentage * data.length / 100.0).toLong
  val thresholdFinder =
    new ThresholdFinder(labels.size, stoppingCriterion, maxBins, minBinWeight)
  // Candidates were prepended while walking upwards, so put them back in ascending order.
  val ordered = candidates.sortBy { case (cut, _) => cut }
  thresholdFinder.findThresholds(ordered).map(_.toDouble)
}