def main()

in src/main/scala/com/twitter/penguin/korean/qa/BatchTokenizeTweets.scala [42:88]


  def main(args: Array[String]) {
    if (args.length != 1) {
      println("The first arg should be an input file of Korean tweets.")
      return
    }
    val parseTimesAll = Source.fromFile(args(0)).getLines().foldLeft(List[ParseTime]()) {
      case (l: List[ParseTime], line: String) =>
        val t0 = System.currentTimeMillis()
        val parsed = TwitterKoreanProcessor.tokenize(line)
        val t1 = System.currentTimeMillis()

        if (VERBOSE) {
          println(parsed.map(t => t.text + "/" + t.pos).mkString(" "))
        }
        ParseTime(t1 - t0, line.trim) :: l
    }

    val loadingTime = parseTimesAll.last

    LOG.log(Level.INFO, "The first one \"%s\" took %d ms including the loading time.".format(loadingTime.chunk, loadingTime.time))

    val parseTimes = parseTimesAll.init

    val averageTweetLength = parseTimes.map(_.chunk.length).sum.toDouble / parseTimes.size

    val averageTime = parseTimes.map(_.time).sum.toDouble / parseTimes.size
    val maxItem = parseTimes.maxBy(_.time)

    LOG.log(Level.INFO, ("Parsed %d items. \n" +
        "       Total time: %d s \n" +
        "       Average tweet length: %.2f chars \n" +
        "       Average time per tweet: %.2f ms \n" +
        "       Max time: %d ms, %s\n" +
        "       Parsed: %s"
        ).format(
          parseTimes.size,
          parseTimes.map(_.time).sum / 1000,
          averageTweetLength,
          averageTime,
          maxItem.time,
          maxItem.chunk,
          TwitterKoreanProcessor.tokenize(maxItem.chunk).map {
            case t if t.unknown => t.text.toString + t.pos + "*"
            case t => t.text + t.pos.toString
          }.mkString(" ")
        ))
  }