in src/main/scala/com/twitter/penguin/korean/qa/BatchTokenizeTweets.scala [42:88]
def main(args: Array[String]): Unit = {
  if (args.length != 1) {
    println("The first arg should be an input file of Korean tweets.")
    return
  }

  // Tokenize each line, timing each call. Prepending with :: builds the
  // list in reverse: the first line processed ends up last.
  val parseTimesAll = Source.fromFile(args(0)).getLines().foldLeft(List[ParseTime]()) {
    case (l: List[ParseTime], line: String) =>
      val t0 = System.currentTimeMillis()
      val parsed = TwitterKoreanProcessor.tokenize(line)
      val t1 = System.currentTimeMillis()

      if (VERBOSE) {
        println(parsed.map(t => t.text + "/" + t.pos).mkString(" "))
      }
      ParseTime(t1 - t0, line.trim) :: l
  }
  // The first tweet parsed (the last element of the reversed list) also pays
  // the one-time dictionary loading cost, so report it separately and
  // exclude it from the statistics below.
  val loadingTime = parseTimesAll.last
  LOG.log(Level.INFO, "The first one \"%s\" took %d ms including the loading time."
    .format(loadingTime.chunk, loadingTime.time))

  val parseTimes = parseTimesAll.init

  // Note: requires at least two input lines; maxBy throws on an empty list.
  val averageTweetLength = parseTimes.map(_.chunk.length).sum.toDouble / parseTimes.size
  val averageTime = parseTimes.map(_.time).sum.toDouble / parseTimes.size
  val maxItem = parseTimes.maxBy(_.time)
LOG.log(Level.INFO, ("Parsed %d items. \n" +
" Total time: %d s \n" +
" Average tweet length: %.2f chars \n" +
" Average time per tweet: %.2f ms \n" +
" Max time: %d ms, %s\n" +
" Parsed: %s"
).format(
parseTimes.size,
parseTimes.map(_.time).sum / 1000,
averageTweetLength,
averageTime,
maxItem.time,
maxItem.chunk,
TwitterKoreanProcessor.tokenize(maxItem.chunk).map {
case t if t.unknown => t.text.toString + t.pos + "*"
case t => t.text + t.pos.toString
}.mkString(" ")
))
}
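
The method refers to ParseTime, LOG, and VERBOSE from the enclosing object, plus imports for Source, Level, and TwitterKoreanProcessor. Below is a minimal sketch of those surrounding declarations, enough to make the excerpt compile; the names are taken from the excerpt itself, but the exact definitions and import list are assumptions rather than the repository's verbatim code.

package com.twitter.penguin.korean.qa

import java.util.logging.{Level, Logger}

import scala.io.Source

import com.twitter.penguin.korean.TwitterKoreanProcessor

object BatchTokenizeTweets {

  // Wall-clock tokenization time in ms, paired with the tweet text (assumed shape).
  case class ParseTime(time: Long, chunk: String)

  private val LOG = Logger.getLogger(getClass.getSimpleName)
  private val VERBOSE = true // set to false to silence per-tweet output

  // def main(args: Array[String]): Unit = { ... } as in the excerpt above
}

With those declarations in place, the batch can be run from the repository root, e.g. with sbt: sbt "runMain com.twitter.penguin.korean.qa.BatchTokenizeTweets tweets.txt", where tweets.txt is a hypothetical input file containing one Korean tweet per line.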