protected def getCandidatePhraseChunks()

in src/main/scala/com/twitter/penguin/korean/phrase_extractor/KoreanPhraseExtractor.scala [217:297]


  protected def getCandidatePhraseChunks(phrases: KoreanPhraseChunk,
                                         filterSpam: Boolean = false): Seq[KoreanPhraseChunk] = {
    // Rejects a phrase when spam filtering is on and any of its tokens is a known spam noun.
    def isNotSpam(phrase: KoreanPhrase): Boolean =
      !filterSpam || !phrase.tokens.exists(
        t => KoreanDictionaryProvider.spamNouns.contains(t.text)
      )

    // A non-noun phrase may still extend a candidate chunk if it is a modifying
    // predicate, a conjunctive josa, or an alphanumeric token.
    def isNonNounPhraseCandidate(phrase: KoreanPhrase): Boolean = {
      val trimmed = trimPhrase(phrase)

      // Modifier forms such as 하는, 할인된, 할인될; excludes connective forms such as 하지만
      def isModifyingPredicate: Boolean = {
        val lastChar: Char = trimmed.tokens.last.text.last
        (trimmed.pos == Verb || trimmed.pos == Adjective) &&
          ModifyingPredicateEndings.contains(Hangul.decomposeHangul(lastChar).coda) &&
          !ModifyingPredicateExceptions.contains(lastChar)
      }

      // Conjunctive josa (particles) such as 과, 와, 의
      def isConjunction: Boolean =
        trimmed.pos == Josa && ConjunctionJosa.contains(trimmed.tokens.last.text)

      def isAlphaNumeric: Boolean =
        trimmed.pos == Alpha || trimmed.pos == Number

      isAlphaNumeric || isModifyingPredicate || isConjunction
    }

    // Merges each run of consecutive Noun/ProperNoun phrases into a single phrase,
    // leaving all other phrases untouched.
    def collapseNounPhrases(phrases: KoreanPhraseChunk): KoreanPhraseChunk = {
      val (output, buffer) = phrases.foldLeft((Seq[KoreanPhrase](), Seq[KoreanPhrase]())) {
        case ((output, buffer), phrase) if phrase.pos == Noun || phrase.pos == ProperNoun =>
          (output, buffer :+ phrase)
        case ((output, buffer), phrase) =>
          val tempPhrases = if (buffer.nonEmpty) {
            Seq(KoreanPhrase(buffer.flatMap(_.tokens)), phrase)
          } else {
            Seq(phrase)
          }
          (output ++ tempPhrases, Seq[KoreanPhrase]())
      }
      // Flush any trailing run of nouns left in the buffer.
      if (buffer.nonEmpty) output :+ KoreanPhrase(buffer.flatMap(_.tokens)) else output
    }
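    // For example, adjacent noun phrases such as [트위터][코리아] collapse into one
    // phrase containing both token sequences, while a following josa stays separate.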

    // Builds every candidate chunk: phrases accumulate in a buffer, the buffer is
    // recorded to the output whenever a Noun/ProperNoun extends it, and it is flushed
    // when a phrase that cannot extend a chunk is reached.
    def collapsePhrases(phrases: KoreanPhraseChunk): Seq[KoreanPhraseChunk] = {
      def addPhraseToBuffer(phrase: KoreanPhrase, buffer: Seq[KoreanPhraseChunk]): Seq[KoreanPhraseChunk] = {
        buffer.map(b => b :+ phrase)
      }

      // The buffer always starts with a single empty chunk for addPhraseToBuffer to extend.
      def newBuffer: Seq[Seq[KoreanPhrase]] = Seq(Seq[KoreanPhrase]())

      val (output, buffer) = phrases.foldLeft((Seq[KoreanPhraseChunk](), newBuffer)) {
        case ((output, buffer), phrase) if PhraseTokens.contains(phrase.pos) && isNotSpam(phrase) =>
          val bufferWithThisPhrase = addPhraseToBuffer(phrase, buffer)
          if (phrase.pos == Noun || phrase.pos == ProperNoun) {
            (output ++ bufferWithThisPhrase, bufferWithThisPhrase)
          } else {
            (output, bufferWithThisPhrase)
          }
        case ((output, buffer), phrase) if isNonNounPhraseCandidate(phrase) =>
          (output, addPhraseToBuffer(phrase, buffer))
        case ((output, buffer), phrase) =>
          (output ++ buffer, newBuffer)
      }
      if (buffer.nonEmpty) output ++ buffer else output
    }
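    // Example: with 맛있는(Adjective) followed by 커피(Noun), the buffer grows to
    // [맛있는 커피] and that chunk is recorded as a candidate when the noun arrives.
    // Any phrase matching none of the cases flushes the buffer to the output and resets it.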

    // Single Noun/ProperNoun phrases also qualify on their own, provided the trimmed
    // phrase is long enough in characters or in token count.
    def getSingleTokenNouns: Seq[KoreanPhraseChunk] = {
      phrases.filter {
        phrase =>
          val trimmed = trimPhrase(phrase)
          (phrase.pos == Noun || phrase.pos == ProperNoun) && isNotSpam(phrase) &&
            (trimmed.length >= MinCharsPerPhraseChunkWithoutSpaces ||
              trimmed.tokens.length >= MinPhrasesPerPhraseChunk)
      }.map(phrase => Seq(trimPhrase(phrase)))
    }

    // Collapse consecutive nouns, expand into candidate chunks, add qualifying single
    // nouns, then trim and de-duplicate the result.
    val nounPhrases: KoreanPhraseChunk = collapseNounPhrases(phrases)
    val phraseCollapsed = collapsePhrases(nounPhrases)

    distinctPhrases(phraseCollapsed.map(trimPhraseChunk) ++ getSingleTokenNouns)
  }
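
Usage context (a minimal sketch): getCandidatePhraseChunks is protected and is reached
through the library's public phrase-extraction entry point rather than called directly.
The example below assumes the TwitterKoreanProcessor facade with tokenize/extractPhrases
methods from the same package; their exact signatures are an assumption here, not part
of the code above.

  import com.twitter.penguin.korean.TwitterKoreanProcessor

  object PhraseExample extends App {
    // Tokenize first, then extract phrases; extractPhrases runs the candidate-chunk
    // logic shown above internally. filterSpam mirrors the parameter documented above.
    // (tokenize/extractPhrases names and parameters are assumed, not shown in this snippet.)
    val tokens = TwitterKoreanProcessor.tokenize("아름다운 트위터 코리아를 만들자")
    val phrases = TwitterKoreanProcessor.extractPhrases(tokens, filterSpam = true)
    phrases.foreach(println)
  }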