def main()

in src/main/scala/com/twitter/penguin/korean/qa/BatchGetUnknownNouns.scala [41:68]


  def main(args: Array[String]) {
    if (args.length != 1) {
      println("The first arg should be an input file path of Korean tweets.")
      return
    }
    val chunksWithUnknowns = Source.fromFile(args(0)).getLines().foldLeft(List[ChunkWithTweet]()) {
      case (l: List[ChunkWithTweet], line: String) if line.trim.length > 5 =>
        chunk(line).flatMap {
          case t: KoreanToken if t.pos == KoreanPos.Korean && tokenize(t.text).exists(_.unknown) =>
            Some(ChunkWithTweet(t.text, line.trim))
          case t: KoreanToken => None
        }.toList ::: l
      case (l: List[ChunkWithTweet], line: String) => l
    }.toSet

    chunksWithUnknowns.toSeq.sortBy(_.chunk).foreach {
      chunkWithTweet: ChunkWithTweet =>
        println(chunkWithTweet.tweet)
        println(TwitterKoreanProcessor
            .tokenize(chunkWithTweet.tweet)
            .mkString(" "))

        println(chunkWithTweet.chunk + ": " +
            tokenize(chunkWithTweet.chunk).mkString(" "))
        println()
    }

  }