def stem()

in src/main/scala/com/twitter/penguin/korean/stemmer/KoreanStemmer.scala [22:69]


  /**
   * Recovers the stem form of predicate tokens and folds the ending tokens that follow
   * them into the predicate.
   *
   * The work is done in two passes:
   *
   *  1. Each token whose pos is in `Predicates` has its text replaced by the result of
   *     `predicateStems(pos)(text)`. A token whose pos is in `Endings` is absorbed by the
   *     most recently emitted token when that token is a predicate: the predicate keeps
   *     its text and its length grows by the ending's length. When the most recently
   *     emitted token is not a predicate, the ending token is dropped.
   *  1. A Verb token longer than two characters whose last two characters are in
   *     `EndingsForNouns`, and whose remaining prefix is found in the Noun dictionary,
   *     is split into a Noun token followed by a token carrying the two-character ending.
   *
   * @param tokens tokens to stem
   * @return `tokens` itself when it contains no Verb or Adjective token; otherwise the
   *         stemmed token sequence
   */
  def stem(tokens: Seq[KoreanToken]): Seq[KoreanToken] = {
    // Splits the text of a Verb token into (noun heading, two-character ending) when the
    // ending is in EndingsForNouns and the heading is a dictionary noun. The cheap pos and
    // length checks run first so the dictionary lookup only happens for real candidates.
    def splitNounHeading(token: KoreanToken): Option[(String, String)] = {
      val text = token.text
      if (token.pos == Verb && text.length > 2) {
        val (heading, ending) = text.splitAt(text.length - 2)
        if (EndingsForNouns.contains(ending) && koreanDictionary(Noun).contains(heading)) {
          Some((heading, ending))
        } else {
          None
        }
      } else {
        None
      }
    }

    if (!tokens.exists(t => t.pos == Verb || t.pos == Adjective)) {
      // Nothing to stem: hand the input back untouched.
      tokens
    } else {
      // The accumulator is built in reverse (newest token at the head) and flipped at the end.
      val stemmed = tokens.foldLeft(List.empty[KoreanToken]) { (acc, token) =>
        acc match {
          case prev :: rest if Endings.contains(token.pos) =>
            if (Predicates.contains(prev.pos)) {
              // Absorb the ending: the text stays the stem, only the covered length grows.
              KoreanToken(
                prev.text,
                prev.pos, prev.offset, prev.length + token.length, prev.unknown
              ) :: rest
            } else {
              // An ending that does not follow a predicate is discarded.
              acc
            }
          case _ if Predicates.contains(token.pos) =>
            // Replace the surface form with its stem; offset and length still describe
            // the original span.
            KoreanToken(
              predicateStems(token.pos)(token.text),
              token.pos, token.offset, token.length, token.unknown
            ) :: acc
          case _ =>
            token :: acc
        }
      }.reverse

      stemmed.flatMap { token =>
        splitNounHeading(token) match {
          case Some((heading, ending)) =>
            // The second token takes whatever length is left after the heading, so any
            // length absorbed from merged endings stays with the ending part.
            Seq(
              KoreanToken(heading, Noun, token.offset, heading.length),
              KoreanToken(ending, token.pos, token.offset + heading.length, token.length - heading.length)
            )
          case None =>
            Seq(token)
        }
      }
    }
  }