in src/main/scala/com/twitter/penguin/korean/phrase_extractor/KoreanPhraseExtractor.scala [217:297]
/**
 * Build candidate phrase chunks (sequences of phrases that could form a noun phrase)
 * from a chunk of Korean phrases.
 *
 * Pipeline: collapse adjacent noun runs into single phrases, then grow candidate
 * chunks left-to-right (emitting a candidate each time a noun is appended), and
 * finally add qualifying single-noun phrases on their own.
 *
 * @param phrases    input phrase chunk to extract candidates from
 * @param filterSpam if true, drop phrases containing known spam nouns
 * @return de-duplicated candidate phrase chunks
 */
protected def getCandidatePhraseChunks(phrases: KoreanPhraseChunk,
                                       filterSpam: Boolean = false): Seq[KoreanPhraseChunk] = {
  // True unless spam filtering is enabled and the phrase contains a known spam noun.
  def isNotSpam(phrase: KoreanPhrase): Boolean =
    !filterSpam || !phrase.tokens.exists(
      t => KoreanDictionaryProvider.spamNouns.contains(t.text)
    )

  // A non-noun phrase can still extend a candidate chunk if it is alphanumeric,
  // a modifying predicate, or a conjunctive josa (particle).
  def isNonNounPhraseCandidate(phrase: KoreanPhrase): Boolean = {
    val trimmed = trimPhrase(phrase)

    // 하는, 할인된, 할인될, exclude: 하지만
    def isModifyingPredicate: Boolean = {
      val lastChar: Char = trimmed.tokens.last.text.last
      (trimmed.pos == Verb || trimmed.pos == Adjective) &&
        ModifyingPredicateEndings.contains(Hangul.decomposeHangul(lastChar).coda) &&
        !ModifyingPredicateExceptions.contains(lastChar)
    }

    // 과, 와, 의
    def isConjunction: Boolean =
      trimmed.pos == Josa && ConjunctionJosa.contains(trimmed.tokens.last.text)

    def isAlphaNumeric: Boolean =
      trimmed.pos == Alpha || trimmed.pos == Number

    isAlphaNumeric || isModifyingPredicate || isConjunction
  }

  // Merge each maximal run of adjacent Noun/ProperNoun phrases into a single
  // KoreanPhrase; all other phrases pass through unchanged.
  def collapseNounPhrases(phrases: KoreanPhraseChunk): KoreanPhraseChunk = {
    val (output, buffer) = phrases.foldLeft((Seq.empty[KoreanPhrase], Seq.empty[KoreanPhrase])) {
      case ((output, buffer), phrase) if phrase.pos == Noun || phrase.pos == ProperNoun =>
        (output, buffer :+ phrase)
      case ((output, buffer), phrase) =>
        // Non-noun ends the current run: flush the buffered nouns as one phrase.
        val tempPhrases = if (buffer.nonEmpty) {
          Seq(KoreanPhrase(buffer.flatMap(_.tokens)), phrase)
        } else {
          Seq(phrase)
        }
        (output ++ tempPhrases, Seq.empty[KoreanPhrase])
    }
    // Flush a trailing noun run, if any.
    if (buffer.nonEmpty) output :+ KoreanPhrase(buffer.flatMap(_.tokens)) else output
  }

  // Grow every in-progress candidate chunk with each extendable phrase. Whenever
  // a noun is appended, the extended chunks are also emitted as candidates, so
  // emitted chunks end in a noun. A non-extendable phrase flushes the buffer.
  def collapsePhrases(phrases: KoreanPhraseChunk): Seq[KoreanPhraseChunk] = {
    def addPhraseToBuffer(phrase: KoreanPhrase, buffer: Seq[KoreanPhraseChunk]): Seq[KoreanPhraseChunk] =
      buffer.map(b => b :+ phrase)

    // A fresh buffer containing a single empty chunk to extend from.
    def newBuffer: Seq[Seq[KoreanPhrase]] = Seq(Seq[KoreanPhrase]())

    val (output, buffer) = phrases.foldLeft((Seq.empty[KoreanPhraseChunk], newBuffer)) {
      case ((output, buffer), phrase) if PhraseTokens.contains(phrase.pos) && isNotSpam(phrase) =>
        val bufferWithThisPhrase = addPhraseToBuffer(phrase, buffer)
        if (phrase.pos == Noun || phrase.pos == ProperNoun) {
          // Nouns both extend open chunks and emit them as complete candidates.
          (output ++ bufferWithThisPhrase, bufferWithThisPhrase)
        } else {
          (output, bufferWithThisPhrase)
        }
      case ((output, buffer), phrase) if isNonNounPhraseCandidate(phrase) =>
        (output, addPhraseToBuffer(phrase, buffer))
      case ((output, buffer), _) =>
        // Non-candidate phrase: flush open chunks and restart.
        // NOTE(review): this may emit the buffer's empty seed chunk; presumably
        // trimPhraseChunk/distinctPhrases downstream discard empties — confirm.
        (output ++ buffer, newBuffer)
    }
    if (buffer.nonEmpty) output ++ buffer else output
  }

  // Single noun phrases that are long enough (in chars or in tokens) to stand
  // alone as candidates.
  def getSingleTokenNouns: Seq[KoreanPhraseChunk] = {
    phrases.filter {
      phrase =>
        val trimmed = trimPhrase(phrase)
        (phrase.pos == Noun || phrase.pos == ProperNoun) && isNotSpam(phrase) &&
          (trimmed.length >= MinCharsPerPhraseChunkWithoutSpaces ||
            trimmed.tokens.length >= MinPhrasesPerPhraseChunk)
    }.map(phrase => Seq(trimPhrase(phrase)))
  }

  val nounPhrases: KoreanPhraseChunk = collapseNounPhrases(phrases)
  val phraseCollapsed = collapsePhrases(nounPhrases)
  distinctPhrases(phraseCollapsed.map(trimPhraseChunk) ++ getSingleTokenNouns)
}