in src/main/scala/com/twitter/penguin/korean/tokenizer/KoreanChunker.scala [105:127]
/**
 * Splits `text` into a list of position-tagged chunks.
 *
 * - Empty text yields no chunks.
 * - Text whose first character is a space character is returned as one
 *   `Space` chunk covering the whole string.
 * - Otherwise the patterns in `POS_PATTERNS` are tried in `CHUNKING_ORDER`;
 *   a match is accepted only if it is disjoint from every chunk accepted so
 *   far, so positions earlier in `CHUNKING_ORDER` take precedence. The
 *   accepted chunks are sorted by start offset and handed to
 *   `fillInUnmatched` together with `Foreign` (presumably the tag used for
 *   the uncovered spans -- confirm against `fillInUnmatched`).
 *
 * @param text the text to split
 * @return the chunks produced for `text`; `Nil` when `text` is empty
 */
private[this] def splitChunks(text: String): List[ChunkMatch] = {
  if (text.isEmpty) {
    // Guard: charAt(0) below would throw StringIndexOutOfBoundsException.
    Nil
  } else if (text.charAt(0).isSpaceChar) {
    List(ChunkMatch(0, text.length, text, Space))
  } else {
    val chunksBuf = new ListBuffer[ChunkMatch]()
    // Total number of characters covered by the chunks accepted so far.
    var matchedLen = 0
    CHUNKING_ORDER.foreach { pos =>
      // Once the accepted chunks cover the whole text, skip remaining patterns.
      if (matchedLen < text.length) {
        val m = POS_PATTERNS(pos).matcher(text)
        while (m.find()) {
          val cm = ChunkMatch(m.start, m.end, m.group(), pos)
          // Only keep matches that are disjoint from every accepted chunk.
          if (chunksBuf.forall(cm.disjoint)) {
            chunksBuf += cm
            matchedLen += cm.end - cm.start
          }
        }
      }
    }
    // Chunks were collected pattern by pattern; restore text order.
    val chunks = chunksBuf.sortBy(_.start).toList
    fillInUnmatched(text, chunks, Foreign)
  }
}