in ratatool-sampling/src/main/scala/com/spotify/ratatool/samplers/BigSampler.scala [92:132]
/** Attempt to parse a string as a BigQuery table spec (e.g. "project:dataset.table"). */
private def parseAsBigQueryTable(tblRef: String): Option[TableReference] =
  Try(BigQueryHelpers.parseTableSpec(tblRef)).toOption

/** Attempt to parse a string as a URI (e.g. a GCS or local file path). */
private def parseAsURI(uri: String): Option[URI] =
  Try(new URI(uri)).toOption

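// Illustrative sketch only (not part of the original BigSampler source): how the two
// parsers above are typically combined to decide whether a user-supplied --input/--output
// argument refers to a BigQuery table or a file URI. `PathKind`, `BigQueryPath`,
// `FilePath`, and `classifyPath` are hypothetical names introduced for this example.
private sealed trait PathKind
private case class BigQueryPath(table: TableReference) extends PathKind
private case class FilePath(uri: URI) extends PathKind

// Try the BigQuery table spec first (e.g. "project:dataset.table"), then fall back to a
// URI (e.g. "gs://bucket/path/part-*.avro"); return None if neither parse succeeds.
private def classifyPath(path: String): Option[PathKind] =
  parseAsBigQueryTable(path)
    .map(t => BigQueryPath(t))
    .orElse(parseAsURI(path).map(u => FilePath(u)))
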
private def usage(): Unit = {
  // TODO: Rename --exact to something better
  println(s"""BigSampler - a tool for big data sampling
             |Usage: ratatool $command [dataflow_options] [options]
             |
             | --sample=<percentage> Fraction of records to take in the sample, a decimal between 0.0 and 1.0
             | --input=<path> Input file path or BigQuery table
             | --output=<path> Output file path or BigQuery table
             | [--fields=<field1,field2,...>] An optional list of fields to include in hashing for sampling cohort selection
             | [--seed=<seed>] An optional seed used in hashing for sampling cohort selection
             | [--hashAlgorithm=(murmur|farm)] An optional arg to select the hashing algorithm for sampling cohort selection. Defaults to FarmHash for BigQuery compatibility
             | [--distribution=(uniform|stratified)] An optional arg to sample for a stratified or uniform distribution. Requires `--distributionFields`
             | [--distributionFields=<field1,field2,...>] An optional list of fields to sample for distribution. Requires `--distribution`
             | [--exact] An optional arg for higher precision distribution sampling
             | [--byteEncoding=(raw|hex|base64)] An optional arg for how to encode fields of type bytes: raw bytes, hex encoded string, or base64 encoded string. Default is to hash raw bytes.
             | [--bigqueryPartitioning=<day|hour|month|year|null>] An optional arg specifying what partitioning to use for the output BigQuery table, or 'null' for no partitioning. Defaults to day.
             |
             |Since this runs a Scio/Beam pipeline, Dataflow options must be provided. At a
             |minimum, the following should be specified:
             |
             | --project=<gcp-project-id> GCP Project used to run your job
             | --runner=DataflowRunner Executes the job on Google Cloud Dataflow
             | --tempLocation=<gcs-path> Location for temporary files. The GCS bucket must exist before running the job.
             |
             |The following options are recommended but not required.
             |
             | --serviceAccount=<your-service-account> Service account used on Dataflow workers. Useful to avoid permissions issues.
             | --workerMachineType=<machine-type> Machine type for Dataflow workers. Can be tuned to your specific needs.
             | --maxNumWorkers=<num-workers> Limits the number of workers (machines) used in the job to avoid using up quota.
             |
             |For more details on Dataflow options, see https://cloud.google.com/dataflow/pipelines/specifying-exec-params
             |""".stripMargin)
  sys.exit(1)
}
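
// Example invocation (illustration only; the project, dataset, table, field, and bucket
// names below are placeholders, and <command> stands for the subcommand name interpolated
// as $command in usage() above): sample 1% of a BigQuery table into another table with a
// stratified distribution, using the options documented in usage().
//
//   ratatool <command> \
//     --sample=0.01 \
//     --input=my-project:my_dataset.my_table \
//     --output=my-project:my_dataset.my_table_sample \
//     --distribution=stratified \
//     --distributionFields=user_id \
//     --project=my-project \
//     --runner=DataflowRunner \
//     --tempLocation=gs://my-bucket/temp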