in src/main/scala/com/twitter/penguin/korean/qa/BatchGetUnknownNouns.scala [41:68]
/**
 * Reads a file of Korean tweets and prints every chunk that contains an
 * unknown token, together with the tweet it came from.
 *
 * Each line of the input file is treated as one tweet. Lines whose trimmed
 * length is 5 or less are skipped. For the remaining lines, every chunk whose
 * part of speech is `KoreanPos.Korean` and whose tokenization contains at
 * least one token flagged `unknown` is collected. Duplicate (chunk, tweet)
 * pairs are removed, and the results are printed sorted by chunk text.
 *
 * For every collected entry four lines are written to standard output:
 * the tweet, the tokenized tweet, "chunk: tokenized chunk", and a blank line.
 *
 * @param args must contain exactly one element: the path of the input file.
 *             Otherwise a usage message is printed and nothing else happens.
 */
def main(args: Array[String]): Unit = {
  if (args.length != 1) {
    println("The first arg should be an input file path of Korean tweets.")
  } else {
    val source = Source.fromFile(args(0))

    // The whole file is consumed (and materialized via toSet) inside the try,
    // so the underlying file handle can be released safely in finally.
    val chunksWithUnknowns: Set[ChunkWithTweet] =
      try {
        source.getLines().foldLeft(List[ChunkWithTweet]()) {
          case (collected, line) if line.trim.length > 5 =>
            val hits = chunk(line).flatMap {
              case t: KoreanToken
                if t.pos == KoreanPos.Korean && tokenize(t.text).exists(_.unknown) =>
                Some(ChunkWithTweet(t.text, line.trim))
              case _: KoreanToken => None
            }.toList
            // Prepend so that appending stays cheap; final order is decided by the sort below.
            hits ::: collected
          case (collected, _) => collected
        }.toSet
      } finally {
        source.close()
      }

    chunksWithUnknowns.toSeq.sortBy(_.chunk).foreach { entry =>
      println(entry.tweet)
      println(TwitterKoreanProcessor.tokenize(entry.tweet).mkString(" "))
      println(entry.chunk + ": " + tokenize(entry.chunk).mkString(" "))
      println()
    }
  }
}