integration/read-bq-write-bq/it/before.py (47 lines of code) (raw):

# Copyright 2020 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Setup for integration test.

* Rewrites klio-job.yaml so that the input and output BigQuery tables
  have names unique to the run.
* Creates BigQuery tables that need to exist before the test run starts.
"""
import os

import apache_beam as beam
import yaml

import common


def _append_build_id(base_name):
    """Return base_name with a BQ-friendly `GITHUB_SHA` appended."""
    build_id = os.environ.get("GITHUB_SHA", None)
    if not build_id:
        raise Exception("Unable to get build id; env var GITHUB_SHA not set")

    # Valid BQ table names only allow underscores and alphanumeric chars:
    # https://cloud.google.com/bigquery/docs/tables#table_naming
    table_name = "{}_{}".format(base_name, build_id.replace("-", "_"))
    return table_name


def rewrite_klio_config_yaml():
    """Rewrite `klio-job.yaml` with table names that have the GITHUB_SHA appended."""
    klio_cfg_file_path = os.path.join(
        os.path.dirname(__file__), "..", "klio-job.yaml"
    )
    klio_save_file_path = os.path.join(
        os.path.dirname(__file__), "..", "klio-job.yaml.save"
    )

    with open(klio_cfg_file_path) as f:
        config_dict = yaml.safe_load(f)

    # Save the original so it can be restored after the test run
    with open(klio_save_file_path, "w") as g:
        g.write(yaml.safe_dump(config_dict))

    new_input_table_name = _append_build_id(
        config_dict["job_config"]["events"]["inputs"][0]["table"]
    )
    config_dict["job_config"]["events"]["inputs"][0]["table"] = new_input_table_name

    new_output_table_name = _append_build_id(
        config_dict["job_config"]["events"]["outputs"][0]["table"]
    )
    config_dict["job_config"]["events"]["outputs"][0]["table"] = new_output_table_name

    with open(klio_cfg_file_path, "w") as g:
        g.write(yaml.safe_dump(config_dict))


def populate_bigquery_table():
    """Create & populate the input table configured as the event input
    in klio-job.yaml.

    The input table must exist before `klio job run` is called, which is
    why it is created here in the setup script.
    """
    table_schema = {
        "fields": [
            {"name": "entity_id", "type": "STRING", "mode": "NULLABLE"}
        ]
    }

    klio_cfg = common.get_config()
    input_table_cfg = klio_cfg.job_config.events.inputs[0]
    table_name = "{}:{}.{}".format(
        input_table_cfg.project, input_table_cfg.dataset, input_table_cfg.table
    )

    with beam.Pipeline() as p:

        def create_record(v):
            return {"entity_id": v}

        record_ids = p | "CreateIDs" >> beam.Create(common.entity_ids)
        records = record_ids | "CreateRecords" >> beam.Map(create_record)
        records | "write" >> beam.io.WriteToBigQuery(
            table_name,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        )


if __name__ == "__main__":
    rewrite_klio_config_yaml()
    populate_bigquery_table()
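
The script imports a sibling `common` module for `get_config()` and `common.entity_ids`, which are not shown here. Below is a minimal sketch of what such a helper might look like; it assumes `klio-job.yaml` is parsed with `yaml.safe_load` and exposed via attribute access. The `_to_namespace` wrapper and the `entity_ids` values are illustrative assumptions, and Klio's real config object (`klio_core.config.KlioConfig`) is richer than this stand-in.

it/common.py (hypothetical sketch, not the actual module) (raw):

# common.py -- hypothetical helper module assumed by before.py.
# Only mirrors the attribute access the setup script relies on.
import os
import types

import yaml

# Entity IDs the integration test writes to the input table
# (illustrative values; the real list lives in the test suite).
entity_ids = ["entity-1", "entity-2", "entity-3"]


def _to_namespace(obj):
    """Recursively wrap dicts so keys are readable as attributes."""
    if isinstance(obj, dict):
        return types.SimpleNamespace(
            **{k: _to_namespace(v) for k, v in obj.items()}
        )
    if isinstance(obj, list):
        return [_to_namespace(v) for v in obj]
    return obj


def get_config():
    """Load ../klio-job.yaml and expose it with attribute access."""
    path = os.path.join(os.path.dirname(__file__), "..", "klio-job.yaml")
    with open(path) as f:
        return _to_namespace(yaml.safe_load(f))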