in src/main/scala/com/twitter/penguin/korean/tokenizer/KoreanTokenizer.scala [92:110]
def tokenize(text: CharSequence,
             profile: TokenizerProfile = TokenizerProfile.defaultProfile
            ): Seq[KoreanToken] = {
  try {
    chunk(text).flatMap {
      case token: KoreanToken if token.pos == Korean =>
        // Get the best parse of each Korean chunk
        val parsed = parseKoreanChunk(token, profile)

        // Collapse sequence of one-char nouns into one unknown noun: (가Noun 회Noun -> 가회Noun*)
        collapseNouns(parsed)
      case token: KoreanToken =>
        // Non-Korean chunks are passed through unchanged
        Seq(token)
    }
  } catch {
    case e: Exception =>
      // Report the offending input before rethrowing so callers can see what failed
      System.err.println(s"Error tokenizing a chunk: $text")
      throw e
  }
}