private def parseAsBigQueryTable()

in ratatool-sampling/src/main/scala/com/spotify/ratatool/samplers/BigSampler.scala [92:132]


  /** Parses `tblRef` as a BigQuery table spec (`project:dataset.table`), returning None on failure. */
  private def parseAsBigQueryTable(tblRef: String): Option[TableReference] =
    Try(BigQueryHelpers.parseTableSpec(tblRef)).toOption

  /** Parses `uri` as a generic URI (e.g. a GCS or local file path), returning None on failure. */
  private def parseAsURI(uri: String): Option[URI] =
    Try(new URI(uri)).toOption
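
  // A minimal dispatch sketch (not in the original file): the two
  // Option-returning parsers above can be chained to classify the
  // --input argument, trying the BigQuery table spec first and falling
  // back to a generic URI. `describeInput` is a hypothetical helper.
  private def describeInput(input: String): String =
    parseAsBigQueryTable(input)
      .map(t => s"BigQuery table ${BigQueryHelpers.toTableSpec(t)}")
      .orElse(parseAsURI(input).map(u => s"file input at $u"))
      .getOrElse("unrecognized input")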

  /** Prints usage help and exits with a non-zero status. */
  private def usage(): Unit = {
    // TODO: Rename --exact to something better
    println(s"""BigSampler - a tool for big data sampling
        |Usage: ratatool $command [dataflow_options] [options]
        |
        |  --sample=<percentage>                               Percentage of records to take in sample, a decimal between 0.0 and 1.0
        |  --input=<path>                                      Input file path or BigQuery table
        |  --output=<path>                                     Output file path or BigQuery table
        |  [--fields=<field1,field2,...>]                      An optional list of fields to include in hashing for sampling cohort selection
        |  [--seed=<seed>]                                     An optional seed used in hashing for sampling cohort selection
        |  [--hashAlgorithm=(murmur|farm)]                     An optional arg to select the hashing algorithm for sampling cohort selection. Defaults to FarmHash for BigQuery compatibility
        |  [--distribution=(uniform|stratified)]               An optional arg to sample for a stratified or uniform distribution. Requires --distributionFields
        |  [--distributionFields=<field1,field2,...>]          An optional list of fields to sample for distribution. Requires --distribution
        |  [--exact]                                           An optional flag to use higher-precision distribution sampling
        |  [--byteEncoding=(raw|hex|base64)]                   An optional arg for how to encode fields of type bytes: raw bytes, hex encoded string, or base64 encoded string. Default is to hash raw bytes.
        |  [--bigqueryPartitioning=<day|hour|month|year|null>] An optional arg specifying what partitioning to use for the output BigQuery table, or 'null' for no partitioning. Defaults to day.
        |
        |Since this runs a Scio/Beam pipeline, Dataflow options must be provided. At a minimum,
        |specify the following:
        |
        |   --project=<gcp-project-id>                GCP Project used to run your job
        |   --runner=DataflowRunner                   Executes the job on Google Cloud Dataflow
        |   --tempLocation=<gcs-path>                 Location for temporary files. GCS bucket must be created prior to running job.
        |
        |The following options are recommended but may not be necessary:
        |
        |   --serviceAccount=<your-service-account>   Service account used on Dataflow workers. Useful to avoid permissions issues.
        |   --workerMachineType=<machine-type>        Machine type for Dataflow workers; tune to your job's needs.
        |   --maxNumWorkers=<num-workers>             Limits the number of workers (machines) used in the job to avoid using up quota.
        |
        |For more details regarding Dataflow options see here: https://cloud.google.com/dataflow/pipelines/specifying-exec-params
      """.stripMargin)
    sys.exit(1)
  }
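
  // A minimal sketch (not BigSampler's actual implementation) of the
  // hash-based cohort selection described by --sample, --fields, and
  // --seed above: hash the chosen field values with a seeded Murmur
  // hash and keep the record when the hash lands in the first `sample`
  // fraction of the hash space. Assumes Guava's Hashing is on the
  // classpath; the real tool defaults to FarmHash and also handles
  // byte encodings and distributions.
  private def inSampleSketch(fieldValues: Seq[String], sample: Double, seed: Int): Boolean = {
    val hasher = com.google.common.hash.Hashing.murmur3_128(seed).newHasher()
    fieldValues.foreach(v => hasher.putString(v, java.nio.charset.StandardCharsets.UTF_8))
    // Shift the signed 64-bit hash to a non-negative value, scale it into
    // [0, 1], and compare it to the requested sample fraction.
    (hasher.hash().asLong() >>> 1).toDouble / Long.MaxValue.toDouble < sample
  }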