private def isProperPhraseChunk()

in src/main/scala/com/twitter/penguin/korean/phrase_extractor/KoreanPhraseExtractor.scala [106:134]


  /**
   * Decides whether a phrase chunk is acceptable as a phrase.
   *
   * Space phrases are ignored for all size checks. A chunk is accepted when:
   *  - it has at most `MaxPhrasesPerPhraseChunk` non-space phrases and their combined
   *    length is at most `MaxCharsPerPhraseChunkWithoutSpaces`;
   *  - it has at least `MinPhrasesPerPhraseChunk` non-space phrases, or its combined
   *    length is at least `MinCharsPerPhraseChunkWithoutSpaces`;
   *  - at least one non-space phrase is longer than one character; and
   *  - the last token of the last phrase is not the `Suffix` "적".
   *
   * @param phraseChunk the candidate chunk to validate
   * @return `true` if every condition above holds, `false` otherwise
   */
  private def isProperPhraseChunk(phraseChunk: KoreanPhraseChunk): Boolean = {
    val nonSpacePhrases: Seq[KoreanPhrase] = phraseChunk.filter(_.pos != Space)
    val phraseCount = nonSpacePhrases.length
    // Lazy so the total is only computed when a size check actually needs it,
    // and shared between the upper and lower bound checks.
    lazy val charCount = nonSpacePhrases.map(_.length).sum

    def withinUpperBounds: Boolean =
      phraseCount <= MaxPhrasesPerPhraseChunk &&
        charCount <= MaxCharsPerPhraseChunkWithoutSpaces

    // Too few phrases is still acceptable when the chunk has enough characters.
    def meetsLowerBound: Boolean =
      phraseCount >= MinPhrasesPerPhraseChunk ||
        charCount >= MinCharsPerPhraseChunkWithoutSpaces

    def hasMultiCharPhrase: Boolean = nonSpacePhrases.exists(_.length > 1)

    // Evaluated last: by then the size checks guarantee the chunk is non-empty.
    def endsWithNonPhraseSuffix: Boolean = {
      val finalToken = phraseChunk.last.tokens.last
      finalToken.pos == Suffix && finalToken.text == "적"
    }

    withinUpperBounds &&
      meetsLowerBound &&
      hasMultiCharPhrase &&
      !endsWithNonPhraseSuffix
  }