in src/main/scala/com/twitter/penguin/korean/tokenizer/KoreanTokenizer.scala [119:204]
/**
 * Parses one chunk into a sequence of part-of-speech tagged tokens.
 *
 * If the entire chunk text is contained in one of the dictionaries of `koreanDictionary`,
 * the chunk is returned as a single token tagged with that dictionary's pos. Otherwise every
 * substring of at most `MAX_TRACE_BACK` characters is tried against the pos trie, and for each
 * end position only the `TOP_N_PER_STATE` candidates with the lowest
 * `(score, posTieBreaker)` are kept.
 *
 * @param chunk   the chunk to parse; its `text`, `offset` and `length` are used
 * @param profile profile handed to every `ParsedChunk` that is created
 * @return the tokens of the lowest-scoring parse covering the whole chunk, or a single
 *         unknown `Noun` token spanning the chunk when no parse reaches its end
 */
private[this] def parseKoreanChunk(chunk: KoreanToken,
    profile: TokenizerProfile = TokenizerProfile.defaultProfile): Seq[KoreanToken] = {
  // Direct match
  // This may produce 하 -> PreEomi
  // collectFirst stops at the first dictionary containing the text, without a nonlocal return.
  val directMatch = koreanDictionary.collectFirst {
    case (pos, dict) if dict.contains(chunk.text) =>
      Seq(KoreanToken(chunk.text, pos, chunk.offset, chunk.length))
  }

  directMatch.getOrElse {
    // Buffer for solutions: end position within the chunk -> best candidate parses ending there
    val solutions: mutable.Map[Int, List[CandidateParse]] =
      mutable.HashMap.empty[Int, List[CandidateParse]]

    // Initial state: an empty parse positioned at the root of the pos trie
    solutions += 0 -> List(
      CandidateParse(
        ParsedChunk(Seq[KoreanToken](), 1, profile),
        koreanPosTrie, ending = None
      )
    )

    // Find N best parses per state.
    // `start` walks backwards from end - 1, at most MAX_TRACE_BACK characters and never below 0.
    for {
      end <- 1 to chunk.length
      start <- (end - 1).to(math.max(end - MAX_TRACE_BACK, 0), -1)
    } {
      val word = chunk.text.slice(start, end)
      // Computed once per substring; every dictionary lookup below uses the char-array form.
      val wordChars = word.toCharArray

      val candidates = solutions(start).flatMap { solution =>
        // When the previous trie node carries an ending, a parse may also restart from the
        // root trie. The second PossibleTrie argument (0 when continuing, 1 when restarting)
        // is later passed as the second argument of ParsedChunk.
        val continued = solution.curTrie.map(t => PossibleTrie(t, 0))
        val possiblePoses: Seq[PossibleTrie] =
          if (solution.ending.isDefined) {
            continued ++ koreanPosTrie.map(t => PossibleTrie(t, 1))
          } else {
            continued
          }

        possiblePoses.view.filter { t =>
          // Nouns are always accepted; any other pos requires a dictionary hit.
          t.curTrie.curPos == Noun || koreanDictionary(t.curTrie.curPos).contains(wordChars)
        }.map { t =>
          val candidateToAdd =
            if (t.curTrie.curPos == Noun && !koreanDictionary(Noun).contains(wordChars)) {
              // Noun candidate that is missing from the noun dictionary
              val isWordName: Boolean = isName(word)
              val isWordKoreanNameVariation: Boolean = isKoreanNameVariation(word)
              val unknown = !isWordName && !isKoreanNumber(word) && !isWordKoreanNameVariation
              val pos =
                if (unknown || isWordName || isWordKoreanNameVariation) ProperNoun else Noun
              ParsedChunk(
                Seq(KoreanToken(word, pos, chunk.offset + start, word.length, unknown)),
                t.words, profile)
            } else {
              // Dictionary word; nouns listed in properNouns are tagged ProperNoun
              val pos =
                if (t.curTrie.curPos == Noun && properNouns.contains(wordChars)) ProperNoun
                else t.curTrie.curPos
              ParsedChunk(
                Seq(KoreanToken(word, pos, chunk.offset + start, word.length)),
                t.words, profile)
            }

          // A selfNode entry in nextTrie stands for the current trie node itself
          val nextTrie = t.curTrie.nextTrie.map {
            case nt: KoreanPosTrie if nt == selfNode => t.curTrie
            case nt: KoreanPosTrie => nt
          }

          CandidateParse(solution.parse ++ candidateToAdd, nextTrie, t.curTrie.ending)
        }
      }

      // Merge with candidates already found for this end position and keep the best ones
      val existing = solutions.getOrElse(end, List.empty[CandidateParse])
      solutions += end -> (existing ++ candidates).sortBy {
        c => (c.parse.score, c.parse.posTieBreaker)
      }.take(TOP_N_PER_STATE)
    }

    val finalSolutions = solutions(chunk.length)
    if (finalSolutions.isEmpty) {
      // If the chunk is not parseable, treat it as a unknown noun chunk.
      // The token keeps the chunk's own offset, like every other token produced here.
      Seq(KoreanToken(chunk.text, Noun, chunk.offset, chunk.length, true))
    } else {
      // Return the best parse of the final state
      finalSolutions.minBy(c => c.parse.score).parse.posNodes
    }
  }
}