in ratatool-diffy/src/main/scala/com/spotify/ratatool/diffy/BigDiffy.scala [575:610]
/**
 * Prints the BigDiffy command-line usage/help text to stdout and terminates
 * the JVM with exit status 1.
 *
 * Called when the tool is invoked with missing or invalid arguments; the
 * non-zero exit code signals the failed invocation to the calling shell.
 * Note: `sys.exit` means this method never returns normally.
 */
private def usage(): Unit = {
  // Build the full help message first, then emit it in one call.
  // `command` is the subcommand name defined elsewhere in this file.
  val helpText =
    s"""BigDiffy - pair-wise field-level statistical diff
       |Usage: ratatool $command [dataflow_options] [options]
       |
       |  --input-mode=(avro|bigquery)     Diff-ing Avro or BQ records
       |  [--output-mode=(gcs|bigquery)]   Saves to a text file in GCS or a BigQuery dataset. Defaults to GCS
       |  --key=<key>                      '.' separated key field. Specify multiple --key params or multiple ',' separated key fields for multi key usage.
       |  --lhs=<path>                     LHS File path or BigQuery table
       |  --rhs=<path>                     RHS File path or BigQuery table
       |  [--rowRestriction=<filter>]      SQL text filtering statement to apply to BigQuery inputs (not available for avro inputs),
       |                                   similar to a WHERE clause in a query. Aggregates are not supported. Defaults to None
       |  --output=<output>                File path prefix for output
       |  --ignore=<keys>                  ',' separated field list to ignore
       |  --unordered=<keys>               ',' separated field list to treat as unordered
       |  --unorderedFieldKey=<key>        ',' separated list of keys for fields which are unordered nested records. Mappings use ':'
       |                                   For example --unorderedFieldKey=fieldPath:fieldKey,otherPath:otherKey
       |  [--with-header]                  Output all TSVs with header rows. Defaults to false
       |  [--ignore-nan]                   Ignore NaN values when computing stats for differences
       |
       |Since this runs a Scio/Beam pipeline, Dataflow options will have to be provided. At a
       |minimum, the following should be specified:
       |
       |   --project=<gcp-project-id>       GCP Project used to run your job
       |   --runner=DataflowRunner          Executes the job on Google Cloud Dataflow
       |   --tempLocation=<gcs-path>        Location for temporary files. GCS bucket must be created prior to running job.
       |
       |The following options are recommended, but may not be necessary.
       |
       |   --serviceAccount=<your-service-account> Service account used on Dataflow workers. Useful to avoid permissions issues.
       |   --workerMachineType=<machine-type> Can be tweaked based on your specific needs, but is not necessary.
       |   --maxNumWorkers=<num-workers>    Limits the number of workers (machines) used in the job to avoid using up quota.
       |
       |For more details regarding Dataflow options see here: https://cloud.google.com/dataflow/pipelines/specifying-exec-params
       |""".stripMargin
  println(helpText)
  // Exit non-zero: usage is only shown when the invocation was invalid.
  sys.exit(1)
}