in sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineRunner.java [315:453]
public DataflowPipelineJob run(Pipeline pipeline) {
LOG.info("Executing pipeline on the Dataflow Service, which will have billing implications "
+ "related to Google Compute Engine usage and other Google Cloud Services.");
List<DataflowPackage> packages = options.getStager().stageFiles();
JobSpecification jobSpecification = translator.translate(pipeline, packages);
Job newJob = jobSpecification.getJob();
// Set a unique client_request_id in the CreateJob request.
// This is used to ensure idempotence of job creation across retried
// attempts to create a job. Specifically, if the service returns a job with
// a different client_request_id, it means the returned one is a different
// job previously created with the same job name, and that the job creation
// has been effectively rejected. The SDK should return
// Error::Already_Exists to the user in that case.
int randomNum = new Random().nextInt(9000) + 1000;
String requestId = DateTimeFormat.forPattern("YYYYMMddHHmmssmmm").withZone(DateTimeZone.UTC)
.print(DateTimeUtils.currentTimeMillis()) + "_" + randomNum;
newJob.setClientRequestId(requestId);
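// Report the SDK version: print it locally and record it in the job's user
// agent so the service knows which release submitted the job.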
String version = DataflowReleaseInfo.getReleaseInfo().getVersion();
System.out.println("Dataflow SDK version: " + version);
newJob.getEnvironment().setUserAgent(DataflowReleaseInfo.getReleaseInfo());
// The Dataflow Service may write to the temporary directory directly, so
// it must be verified.
DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
if (!Strings.isNullOrEmpty(options.getTempLocation())) {
newJob.getEnvironment().setTempStoragePrefix(
dataflowOptions.getPathValidator().verifyPath(options.getTempLocation()));
}
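// Propagate the temporary dataset id and any requested experiments to the
// job environment.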
newJob.getEnvironment().setDataset(options.getTempDatasetId());
newJob.getEnvironment().setExperiments(options.getExperiments());
// Requirements about the service.
Map<String, Object> environmentVersion = new HashMap<>();
environmentVersion.put(PropertyNames.ENVIRONMENT_VERSION_MAJOR_KEY, ENVIRONMENT_MAJOR_VERSION);
newJob.getEnvironment().setVersion(environmentVersion);
// Default jobType is JAVA_BATCH_AUTOSCALING: a Java batch job whose workers
// the service may autoscale if autoscaling is enabled.
String jobType = "JAVA_BATCH_AUTOSCALING";
if (options.isStreaming()) {
jobType = "STREAMING";
}
environmentVersion.put(PropertyNames.ENVIRONMENT_VERSION_JOB_TYPE_KEY, jobType);
if (hooks != null) {
hooks.modifyEnvironmentBeforeSubmission(newJob.getEnvironment());
}
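// If --dataflowJobFile was set, also write the job specification to that
// local file as JSON for offline inspection.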
if (!Strings.isNullOrEmpty(options.getDataflowJobFile())) {
try (PrintWriter printWriter = new PrintWriter(
new File(options.getDataflowJobFile()))) {
String workSpecJson = DataflowPipelineTranslator.jobToString(newJob);
printWriter.print(workSpecJson);
LOG.info("Printed workflow specification to {}", options.getDataflowJobFile());
} catch (IllegalStateException ex) {
LOG.warn("Cannot translate workflow spec to json for debug.");
} catch (FileNotFoundException ex) {
LOG.warn("Cannot create workflow spec output file.");
}
}
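// When --update is set, look up the id of the running job with the same name
// so the service replaces that job, applying any transform name mapping.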
String jobIdToUpdate = null;
if (options.getUpdate()) {
jobIdToUpdate = getJobIdFromName(options.getJobName());
newJob.setTransformNameMapping(options.getTransformNameMapping());
newJob.setReplaceJobId(jobIdToUpdate);
}
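// Submit the CreateJob request to the Dataflow service. On failure, surface a
// more helpful message when the serialized job exceeds the request size limit.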
Job jobResult;
try {
jobResult = dataflowClient
.projects()
.jobs()
.create(options.getProject(), newJob)
.execute();
} catch (GoogleJsonResponseException e) {
String errorMessages = "Unexpected errors";
if (e.getDetails() != null) {
if (Utf8.encodedLength(newJob.toString()) >= CREATE_JOB_REQUEST_LIMIT_BYTES) {
errorMessages = "The size of the serialized JSON representation of the pipeline "
+ "exceeds the allowable limit. "
+ "For more information, please check the FAQ link below:\n"
+ "https://cloud.google.com/dataflow/faq";
} else {
errorMessages = e.getDetails().getMessage();
}
}
throw new RuntimeException("Failed to create a workflow job: " + errorMessages, e);
} catch (IOException e) {
throw new RuntimeException("Failed to create a workflow job", e);
}
// Obtain all of the extractors from the PTransforms used in the pipeline so the
// DataflowPipelineJob has access to them.
AggregatorPipelineExtractor aggregatorExtractor = new AggregatorPipelineExtractor(pipeline);
Map<Aggregator<?, ?>, Collection<PTransform<?, ?>>> aggregatorSteps =
aggregatorExtractor.getAggregatorSteps();
DataflowAggregatorTransforms aggregatorTransforms =
new DataflowAggregatorTransforms(aggregatorSteps, jobSpecification.getStepNames());
// Use a raw client for post-launch monitoring, as status calls may fail
// regularly and need not be retried automatically.
DataflowPipelineJob dataflowPipelineJob =
new DataflowPipelineJob(options.getProject(), jobResult.getId(),
Transport.newRawDataflowClient(options).build(), aggregatorTransforms);
// If the service returned a client request id, the SDK needs to compare it
// with the original id generated in the request. If they are not the same
// (i.e., the returned job was not created by this request), throw
// DataflowJobAlreadyExistsException or DataflowJobAlreadyUpdatedException,
// depending on whether this is an update or not.
if (jobResult.getClientRequestId() != null && !jobResult.getClientRequestId().isEmpty()
&& !jobResult.getClientRequestId().equals(requestId)) {
// If updating a job.
if (options.getUpdate()) {
throw new DataflowJobAlreadyUpdatedException(dataflowPipelineJob,
String.format("The job named %s with id: %s has already been updated into job id: %s "
+ "and cannot be updated again.",
newJob.getName(), jobIdToUpdate, jobResult.getId()));
} else {
throw new DataflowJobAlreadyExistsException(dataflowPipelineJob,
String.format("There is already an active job named %s with id: %s. If you want "
+ "to submit a second job, try again by setting a different name using --jobName.",
newJob.getName(), jobResult.getId()));
}
}
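// Point the user at the monitoring console and the gcloud cancel command for
// the newly submitted job.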
LOG.info("To access the Dataflow monitoring console, please navigate to {}",
MonitoringUtil.getJobMonitoringPageURL(options.getProject(), jobResult.getId()));
System.out.println("Submitted job: " + jobResult.getId());
LOG.info("To cancel the job using the 'gcloud' tool, run:\n> {}",
MonitoringUtil.getGcloudCancelCommand(options, jobResult.getId()));
return dataflowPipelineJob;
}
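// Illustrative usage sketch (not part of this file): a typical caller configures
// DataflowPipelineOptions, sets this runner, and calls Pipeline#run(), which
// delegates to the run() method above. The project id and GCS path below are
// placeholders, not values from this file.
//
//   DataflowPipelineOptions opts =
//       PipelineOptionsFactory.fromArgs(args).withValidation()
//           .as(DataflowPipelineOptions.class);
//   opts.setRunner(DataflowPipelineRunner.class);
//   opts.setProject("my-project");                        // placeholder
//   opts.setStagingLocation("gs://my-bucket/staging");    // placeholder
//   Pipeline p = Pipeline.create(opts);
//   // ... apply transforms to p ...
//   DataflowPipelineJob job = (DataflowPipelineJob) p.run();
//   LOG.info("Submitted job {}", job.getJobId());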