in src/main/scala/com/twitter/penguin/korean/normalizer/KoreanNormalizer.scala [57:82]
/**
 * Normalizes one chunk of text by running it through a fixed sequence of
 * passes; each pass consumes the output of the previous one.
 *
 * Pass order: ending normalization, single-char repetition collapse,
 * two-char repetition collapse, coda-N normalization, typo correction,
 * and finally whitespace replacement.
 *
 * @param input the text chunk to normalize (presumably Korean-only text
 *              selected by the caller -- confirm against the call site)
 * @return the normalized text
 */
private[this] def normalizeKoreanChunk(input: CharSequence): CharSequence = {
  // Normalize endings: 안됔ㅋㅋㅋ -> 안돼ㅋㅋ
  // Regex.replaceAllIn interprets `$` and `\` in the replacement string as
  // group references / escapes, so the computed replacement is quoted to be
  // inserted literally, consistent with the passes below.
  val endingNormalized = KOREAN_TO_NORMALIZE_REGEX.replaceAllIn(
    input, m => Matcher.quoteReplacement(processNormalizationCandidate(m).toString)
  )

  // Normalize repeating chars: ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ -> ㅋㅋ
  // Only the first two characters of each match are kept.
  val exclamationNormalized = REPEATING_CHAR_REGEX.replaceAllIn(
    endingNormalized, m => Matcher.quoteReplacement(m.group(0).take(2))
  )

  // Normalize repeating two-char sequences:
  // 훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍 -> 훌쩍훌쩍
  // Only the first four characters (two repetitions) of each match are kept.
  val repeatingNormalized = REPEATING_2CHAR_REGEX.replaceAllIn(
    exclamationNormalized, m => Matcher.quoteReplacement(m.group(0).take(4))
  )

  // Coda normalization (noun + attached ㄴ coda): 소린가 -> 소리인가
  val codaNNormalized = normalizeCodaN(repeatingNormalized)

  // Typo correction: 하겟다 -> 하겠다
  val typoCorrected = correctTypo(codaNNormalized)

  // Spaces, tabs, new lines are replaced with a single space.
  WHITESPACE_REGEX.replaceAllIn(typoCorrected, " ")
}