in ratatool-diffy/src/main/scala/com/spotify/ratatool/diffy/BigDiffy.scala [575:610]
/**
 * Prints the BigDiffy command-line usage/help text to stdout and terminates
 * the JVM with exit status 1.
 *
 * Called when the tool is invoked with missing or invalid arguments; the
 * non-zero exit code signals the failed invocation to the calling shell.
 * Note: `sys.exit` means this method never returns normally.
 */
private def usage(): Unit = {
  // Build the full help message first, then emit it in one call.
  // `command` is the subcommand name defined elsewhere in this file.
  val helpText =
    s"""BigDiffy - pair-wise field-level statistical diff
       |Usage: ratatool $command [dataflow_options] [options]
       |
       |  --input-mode=(avro|bigquery)     Diff-ing Avro or BQ records
       |  [--output-mode=(gcs|bigquery)]   Saves to a text file in GCS or a BigQuery dataset. Defaults to GCS
       |  --key=<key>                      '.' separated key field. Specify multiple --key params or multiple ',' separated key fields for multi key usage.
       |  --lhs=<path>                     LHS File path or BigQuery table
       |  --rhs=<path>                     RHS File path or BigQuery table
       |  [--rowRestriction=<filter>]      SQL text filtering statement to apply to BigQuery inputs (not available for avro inputs),
       |                                   similar to a WHERE clause in a query. Aggregates are not supported. Defaults to None
       |  --output=<output>                File path prefix for output
       |  --ignore=<keys>                  ',' separated field list to ignore
       |  --unordered=<keys>               ',' separated field list to treat as unordered
       |  --unorderedFieldKey=<key>        ',' separated list of keys for fields which are unordered nested records. Mappings use ':'
       |                                   For example --unorderedFieldKey=fieldPath:fieldKey,otherPath:otherKey
       |  [--with-header]                  Output all TSVs with header rows. Defaults to false
       |  [--ignore-nan]                   Ignore NaN values when computing stats for differences
       |
       |Since this runs a Scio/Beam pipeline, Dataflow options will have to be provided. At a
       |minimum, the following should be specified:
       |
       |   --project=<gcp-project-id>       GCP Project used to run your job
       |   --runner=DataflowRunner          Executes the job on Google Cloud Dataflow
       |   --tempLocation=<gcs-path>        Location for temporary files. GCS bucket must be created prior to running job.
       |
       |The following options are recommended, but may not be necessary.
       |
       |   --serviceAccount=<your-service-account> Service account used on Dataflow workers. Useful to avoid permissions issues.
       |   --workerMachineType=<machine-type> Can be tweaked based on your specific needs, but is not necessary.
       |   --maxNumWorkers=<num-workers>    Limits the number of workers (machines) used in the job to avoid using up quota.
       |
       |For more details regarding Dataflow options see here: https://cloud.google.com/dataflow/pipelines/specifying-exec-params
       |""".stripMargin
  println(helpText)
  // Exit non-zero: usage is only shown when the invocation was invalid.
  sys.exit(1)
}