private[this] def normalizeKoreanChunk()

in src/main/scala/com/twitter/penguin/korean/normalizer/KoreanNormalizer.scala [57:82]

18 lines of code
1 McCabe index (conditional complexity)


  /**
   * Runs one chunk of text through the normalization pipeline, in this fixed order:
   *
   *  1. ending normalization via `processNormalizationCandidate`
   *  1. truncation of every `REPEATING_CHAR_REGEX` match to its first 2 characters
   *  1. truncation of every `REPEATING_2CHAR_REGEX` match to its first 4 characters
   *  1. coda normalization via `normalizeCodaN`
   *  1. typo correction via `correctTypo`
   *  1. replacement of every `WHITESPACE_REGEX` match with a single space
   *
   * Each step consumes the output of the previous one, so the order matters.
   *
   * @param input the chunk to normalize
   * @return the normalized chunk
   */
  private[this] def normalizeKoreanChunk(input: CharSequence): CharSequence = {
    // Step 1 - endings, e.g. 안됔ㅋㅋㅋ -> 안돼ㅋㅋ
    val withEndingsFixed = KOREAN_TO_NORMALIZE_REGEX.replaceAllIn(
      input,
      candidate => processNormalizationCandidate(candidate).toString
    )

    // Step 2 - runs of a repeated character keep only their first two characters,
    // e.g. ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ -> ㅋㅋ
    // quoteReplacement stops `$` and `\` in the kept text from being read as
    // group references by replaceAllIn.
    val withSingleRepeatsCut = REPEATING_CHAR_REGEX.replaceAllIn(
      withEndingsFixed,
      run => Matcher.quoteReplacement(run.group(0).take(2))
    )

    // Step 3 - runs of a repeated two-character unit keep only their first four characters,
    // e.g. 훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍훌쩍 -> 훌쩍훌쩍
    val withPairRepeatsCut = REPEATING_2CHAR_REGEX.replaceAllIn(
      withSingleRepeatsCut,
      run => Matcher.quoteReplacement(run.group(0).take(4))
    )

    // Step 4 - coda normalization (noun + inserted ㄴ), e.g. 소린가 -> 소리인가
    // Step 5 - typo correction, e.g. 하겟다 -> 하겠다
    val withTyposFixed = correctTypo(normalizeCodaN(withPairRepeatsCut))

    // Step 6 - every WHITESPACE_REGEX match (spaces, tabs, new lines per the original
    // note; the pattern itself is defined elsewhere) becomes a single space.
    WHITESPACE_REGEX.replaceAllIn(withTyposFixed, " ")
  }