in src/main/scala/com/twitter/penguin/korean/phrase_extractor/KoreanPhraseExtractor.scala [106:134]
/**
 * Decides whether a candidate phrase chunk qualifies as a proper phrase.
 *
 * Ignoring phrases whose pos is Space, the chunk must:
 *  - contain at most MaxPhrasesPerPhraseChunk phrases and have a summed
 *    phrase length of at most MaxCharsPerPhraseChunkWithoutSpaces,
 *  - contain at least MinPhrasesPerPhraseChunk phrases, or have a summed
 *    phrase length of at least MinCharsPerPhraseChunkWithoutSpaces,
 *  - contain at least one phrase whose length is greater than 1.
 *
 * In addition, the last token of the last phrase must not be the Suffix "적".
 *
 * @param phraseChunk candidate chunk to validate
 * @return true when every rule above holds, false otherwise
 */
private def isProperPhraseChunk(phraseChunk: KoreanPhraseChunk): Boolean = {
  // Total lookup of the final token: a chunk (or a last phrase) without any
  // token simply has no trailing suffix, so it cannot violate this rule.
  def notEndingInNonPhraseSuffix: Boolean =
    !phraseChunk.lastOption
      .flatMap(_.tokens.lastOption)
      .exists(token => token.pos == Suffix && token.text == "적")

  def isRightLength: Boolean = {
    val phrasesWithoutSpaces: Seq[KoreanPhrase] = phraseChunk.filter(_.pos != Space)

    // Both measurements are shared by the min and max checks; compute them once.
    val phraseCount = phrasesWithoutSpaces.length
    val charCount = phrasesWithoutSpaces.map(_.length).sum

    val withinMaxLength =
      phraseCount <= MaxPhrasesPerPhraseChunk &&
        charCount <= MaxCharsPerPhraseChunkWithoutSpaces

    // Enough phrases, or (when there are too few phrases) enough characters.
    val meetsMinLength =
      phraseCount >= MinPhrasesPerPhraseChunk ||
        charCount >= MinCharsPerPhraseChunkWithoutSpaces

    // Reject chunks made up solely of phrases of length 0 or 1.
    val hasMultiCharPhrase = phrasesWithoutSpaces.exists(_.length > 1)

    withinMaxLength && meetsMinLength && hasMultiCharPhrase
  }

  isRightLength && notEndingInNonPhraseSuffix
}