kong/spec/helpers/perf.lua (360 lines of code) (raw):

local pl_tablex = require("pl.tablex")

local logger = require("spec.helpers.perf.logger")
local utils = require("spec.helpers.perf.utils")
local git = require("spec.helpers.perf.git")
local charts = require("spec.helpers.perf.charts")
local read_all_env = require("kong.cmd.utils.env").read_all

local my_logger = logger.new_logger("[controller]")

utils.register_busted_hook()
charts.register_busted_hook()

-- how many times for each "driver" operation
local RETRY_COUNT = 3
-- the driver instance created by use_driver() and its name
local DRIVER
local DRIVER_NAME
-- the version most recently passed to setup_kong(); cleared by teardown()
local LAST_KONG_VERSION

-- Real user facing functions
local driver_functions = {
  "start_worker", "start_kong", "stop_kong", "setup", "setup_kong", "teardown",
  "get_start_load_cmd", "get_start_stapxx_cmd", "get_wait_stapxx_cmd",
  "generate_flamegraph", "save_error_log", "get_admin_uri",
  "save_pgdump", "load_pgdump", "get_based_version", "remote_execute",
}

--- Verify that a loaded driver module exposes everything this module calls
-- @param mod table the value returned by requiring the driver module
-- @return nothing. Throws an error if `mod` is not a table, has no `new`
-- constructor, or misses any function listed in `driver_functions`.
local function check_driver_sanity(mod)
  if type(mod) ~= "table" then
    error("Driver must return a table")
  end

  -- `new` is the constructor use_driver() calls and also the function whose
  -- source location is reported below; validate it first so that a missing
  -- constructor produces a readable message instead of a failure inside
  -- debug.getinfo
  if type(mod.new) ~= "function" then
    error("Driver must implement function new", 2)
  end

  for _, func in ipairs(driver_functions) do
    if not mod[func] then
      error("Driver " .. debug.getinfo(mod.new, "S").source ..
            " must implement function " .. func, 2)
    end
  end
end

local known_drivers = { "docker", "terraform" }

--- Select and instantiate the driver used by all subsequent operations
-- @function use_driver
-- @param name string name of the driver to use; default to "docker"
-- @param opts[optional] table config parameters passed to the driver
-- @return nothing. Throws an error if any.
local function use_driver(name, opts)
  name = name or "docker"

  if not pl_tablex.find(known_drivers, name) then
    local err = ("Unknown perf test driver \"%s\", expect one of \"%s\""):format(
      name, table.concat(known_drivers, "\", \"")
    )
    error(err, 2)
  end

  local pok, mod = pcall(require, "spec.helpers.perf.drivers." .. name)

  if not pok then
    -- on failure `mod` holds the error raised by require
    error(("Unable to load perf test driver %s: %s"):format(name, mod))
  end

  check_driver_sanity(mod)

  DRIVER = mod.new(opts)
  DRIVER_NAME = name
end

--- Set driver operation retry count
-- @function set_retry_count
-- @param try number the retry time for each driver operation
-- @return nothing. Throws an error if `try` is not a number.
local function set_retry_count(try)
  if type(try) ~= "number" then
    error("expect a number, got " .. type(try))
  end

  RETRY_COUNT = try
end

--- Setup a default perf test instance that's ready to use on
--- most common cases including Github Actions.
-- The driver and its options are read from PERF_TEST_* environment variables.
-- @function use_defaults
-- @return nothing. Throws an error if any.
local function use_defaults()
  logger.set_log_level(ngx.DEBUG)
  set_retry_count(3)

  local driver = os.getenv("PERF_TEST_DRIVER") or "docker"
  local use_daily_image = os.getenv("PERF_TEST_USE_DAILY_IMAGE")

  if driver == "terraform" then
    -- coerce to a real boolean: any value of the variable enables it
    local seperate_db_node = not not os.getenv("PERF_TEST_SEPERATE_DB_NODE")

    local tf_provider = os.getenv("PERF_TEST_TERRAFORM_PROVIDER") or "equinix-metal"
    -- stays empty when the provider is none of the ones handled below
    local tfvars = {}

    if tf_provider == "equinix-metal" then
      tfvars = {
        -- Kong Benchmarking
        metal_project_id = os.getenv("PERF_TEST_METAL_PROJECT_ID"),
        -- TODO: use an org token
        metal_auth_token = os.getenv("PERF_TEST_METAL_AUTH_TOKEN"),
        metal_plan = os.getenv("PERF_TEST_METAL_PLAN"), -- "c3.small.x86"
        -- metal_region = ["sv15", "sv16", "la4"], -- not support setting from lua for now
        metal_os = os.getenv("PERF_TEST_METAL_OS"), -- "ubuntu_20_04",
        seperate_db_node = seperate_db_node,
      }
    elseif tf_provider == "digitalocean" then
      tfvars = {
        do_project_name = os.getenv("PERF_TEST_DIGITALOCEAN_PROJECT_NAME"), -- "Benchmark",
        do_token = os.getenv("PERF_TEST_DIGITALOCEAN_TOKEN"),
        do_size = os.getenv("PERF_TEST_DIGITALOCEAN_SIZE"), -- "c2-8vpcu-16gb",
        do_region = os.getenv("PERF_TEST_DIGITALOCEAN_REGION"), --"sfo3",
        do_os = os.getenv("PERF_TEST_DIGITALOCEAN_OS"), -- "ubuntu-20-04-x64",
        seperate_db_node = seperate_db_node,
      }
    elseif tf_provider == "aws-ec2" then
      tfvars = {
        aws_region = os.getenv("PERF_TEST_AWS_REGION"), -- "us-east-2",
        ec2_instance_type = os.getenv("PERF_TEST_EC2_INSTANCE_TYPE"), -- "c5a.2xlarge",
        ec2_os = os.getenv("PERF_TEST_EC2_OS"), -- "ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*",
        seperate_db_node = seperate_db_node,
      }
    end

    use_driver("terraform", {
      provider = tf_provider,
      tfvars = tfvars,
      use_daily_image = use_daily_image,
      seperate_db_node = seperate_db_node,
    })
  else
    use_driver(driver, {
      use_daily_image = use_daily_image,
    })
  end
end

--- Call a method on the selected driver, retrying when it reports an error
-- @param method string name of the driver method
-- @param ... arguments forwarded to the driver method after the driver itself
-- @return the first value returned by the driver method; returns nothing
-- (after logging a warning) if the driver does not implement `method`.
-- Throws an error if no driver is selected or if every attempt failed.
local function invoke_driver(method, ...)
  if not DRIVER then
    error("No driver selected, call use_driver first", 2)
  end

  if not DRIVER[method] then
    my_logger.warn(method, " not implemented by driver ", DRIVER_NAME)
    return
  end

  local happy
  local r, err
  -- one initial attempt plus RETRY_COUNT retries; a truthy second return
  -- value from the driver is treated as failure
  for i = 1, RETRY_COUNT + 1 do
    r, err = DRIVER[method](DRIVER, ...)
    if not err then
      happy = true
      break
    end

    my_logger.warn("failed in ", method, ": ", err or "nil", ", tries: ", i)
  end

  if not happy then
    error(method .. " finally failed" ..
          (RETRY_COUNT > 0 and " after " .. RETRY_COUNT .. " retries" or ""), 2)
  end

  return r
end

local _M = {
  use_driver = use_driver,
  set_retry_count = set_retry_count,
  use_defaults = use_defaults,

  new_logger = logger.new_logger,
  set_log_level = logger.set_log_level,

  setenv = utils.setenv,
  unsetenv = utils.unsetenv,
  execute = utils.execute,
  wait_output = utils.wait_output,
  parse_docker_image_labels = utils.parse_docker_image_labels,
  clear_loaded_package = utils.clear_loaded_package,

  git_checkout = git.git_checkout,
  git_restore = git.git_restore,
  get_kong_version = git.get_kong_version,
}

--- Start the worker (nginx) with given conf with multiple ports
-- @function start_worker
-- @param conf string the Nginx snippet under server{} context
-- @param port_count[optional] number number of ports the upstream listens to; default to 1
-- @return upstream_uri string or table if port_count is more than 1
function _M.start_worker(conf, port_count)
  port_count = port_count or 1
  local ret = invoke_driver("start_worker", conf, port_count)
  return port_count == 1 and ret[1] or ret
end

--- Start Kong with given version and conf
-- Every KONG_* variable found in the environment is copied into `kong_confs`
-- (lower-cased, prefix removed), overriding an entry of the same name; the
-- table passed by the caller is modified in place.
-- @function start_kong
-- @param kong_confs table Kong configuration as a lua table
-- @param driver_confs table driver configuration as a lua table
-- @return nothing. Throws an error if any.
function _M.start_kong(kong_confs, driver_confs)
  kong_confs = kong_confs or {}
  for k, _ in pairs(read_all_env()) do
    k = k:match("^KONG_([^=]+)")
    k = k and k:lower()
    if k then
      kong_confs[k] = os.getenv("KONG_" .. k:upper())
    end
  end
  return invoke_driver("start_kong", kong_confs, driver_confs or {})
end

--- Stop Kong
-- @function stop_kong
-- @param ... arguments forwarded to the driver
-- @return nothing. Throws an error if any.
function _M.stop_kong(...)
  return invoke_driver("stop_kong", ...)
end

--- Setup environment; it's not necessary if `setup_kong` is called
-- @function setup
-- @return nothing. Throws an error if any.
function _M.setup()
  return invoke_driver("setup")
end

--- Installs Kong, setup env vars and return the configured helpers utility
-- @function setup_kong
-- @param version string Kong version
-- @param kong_confs currently ignored by this function
-- @return table the `helpers` utility as if it's require("spec.helpers")
function _M.setup_kong(version, kong_confs)
  -- remembered so that combine_results() can label the ingested chart data
  LAST_KONG_VERSION = version
  return invoke_driver("setup_kong", version)
end

--- Cleanup all the stuff
-- @function teardown
-- @param full[optional] boolean teardown all stuff, including those will
-- make next test spin up faster
-- @return nothing. Throws an error if any.
function _M.teardown(full)
  LAST_KONG_VERSION = nil
  return invoke_driver("teardown", full)
end

local load_thread
local load_should_stop

--- Start to send load to Kong
-- @function start_load
-- @param opts[optional] table the options below; all of them are optional
-- @param opts.path[optional] string request path, default to /
-- @param opts.uri[optional] string base URI except path, default to http://kong-ip:kong-port/
-- @param opts.connections[optional] number connection count, default to 1000
-- @param opts.threads[optional] number request thread count, default to 5
-- @param opts.duration[optional] number perf test duration in seconds, default to 10
-- @param opts.script[optional] string content of wrk script, default to nil
-- @param opts.kong_name[optional] string specify the kong name to send load to; will automatically pick one if not specified
-- @param opts.wrk2[optional] boolean use wrk2 instead of wrk; only accepted with the terraform driver
-- @param opts.rate[optional] number passed to wrk2 as -R; required when opts.wrk2 is set
-- @return nothing. Throws an error if any.
function _M.start_load(opts)
  if load_thread then
    error("load is already started, stop it using wait_result() first", 2)
  end

  -- every option has a default, so calling without a table is valid
  opts = opts or {}

  local path = opts.path or ""
  -- strip leading /
  if path:sub(1, 1) == "/" then
    path = path:sub(2)
  end

  local prog = opts.wrk2 and "wrk2" or "wrk"
  if opts.wrk2 then
    if DRIVER_NAME ~= "terraform" then
      error("wrk2 not supported in docker driver", 2)
    elseif not opts.rate then
      error("wrk2 requires rate", 2)
    end
  end

  local load_cmd_stub = prog .. " -c " .. (opts.connections or 1000) ..
                        " -t " .. (opts.threads or 5) ..
                        " -d " .. (opts.duration or 10) .. "s" ..
                        (opts.wrk2 and " -R " .. opts.rate or "") ..
                        " %s " .. -- script place holder
                        " %s/" .. path ..
                        " --latency"

  local load_cmd = invoke_driver("get_start_load_cmd", load_cmd_stub, opts.script, opts.uri, opts.kong_name)
  load_should_stop = false

  load_thread = ngx.thread.spawn(function()
    return utils.execute(load_cmd,
        {
          -- polled by utils.execute; returns 9 once a stop was requested
          stop_signal = function() if load_should_stop then return 9 end end,
        })
  end)
end

local stapxx_thread
local stapxx_should_stop

--- Start a stapxx sample and wait for the systemtap probe to load
-- @function start_stapxx
-- @param sample_name string stapxx sample name
-- @param arg string extra arguments passed to stapxx script
-- @param driver_confs table driver configuration as a lua table
-- @return true on success; false and an error string if the probe was not
-- seen in time. Throws an error if stapxx is already started or if the
-- driver fails.
function _M.start_stapxx(sample_name, arg, driver_confs)
  if stapxx_thread then
    error("stapxx is already started, stop it using wait_result() first", 2)
  end

  local start_cmd = invoke_driver("get_start_stapxx_cmd", sample_name, arg, driver_confs or {})
  stapxx_should_stop = false

  stapxx_thread = ngx.thread.spawn(function()
    return utils.execute(start_cmd,
        {
          -- polled by utils.execute; returns 3 once a stop was requested
          stop_signal = function() if stapxx_should_stop then return 3 end end,
        })
  end)

  local wait_cmd = invoke_driver("get_wait_stapxx_cmd")
  -- NOTE(review): 30 is presumably a timeout in seconds; confirm against utils.wait_output
  if not utils.wait_output(wait_cmd, "stap_", 30) then
    return false, "timeout waiting systemtap probe to load"
  end

  return true
end

--- Wait the load test to finish and get result
-- If stapxx was started it is waited for first; its failure is only logged.
-- @function wait_result
-- @param opts currently unused (only referenced by the disabled code below)
-- @return string the test report text. Throws an error if load was not
-- started or if waiting for it failed.
function _M.wait_result(opts)
  if not load_thread then
    error("load haven't been started or already collected, " ..
          "start it using start_load() first", 2)
  end

  -- local timeout = opts and opts.timeout or 3
  -- local ok, res, err

  -- ngx.update_time()
  -- local s = ngx.now()
  -- while not found and ngx.now() - s <= timeout do
  --   ngx.update_time()
  --   ngx.sleep(0.1)
  --   if coroutine.status(self.load_thread) ~= "running" then
  --     break
  --   end
  -- end
  -- print(coroutine.status(self.load_thread), coroutine.running(self.load_thread))

  -- if coroutine.status(self.load_thread) == "running" then
  --   self.load_should_stop = true
  --   return false, "timeout waiting for load to stop (" .. timeout .. "s)"
  -- end

  if stapxx_thread then
    local ok, res, err = ngx.thread.wait(stapxx_thread)
    stapxx_should_stop = true
    stapxx_thread = nil
    if not ok or err then
      my_logger.warn("failed to wait stapxx to finish: ",
        (res or "nil"),
        " err: " .. (err or "nil"))
    end
    my_logger.debug("stap++ output: ", res)
  end

  local ok, res, err = ngx.thread.wait(load_thread)
  load_should_stop = true
  -- cleared before the error check so that start_load() can be called again
  load_thread = nil

  if not ok or err then
    error("failed to wait result: " .. (res or "nil") ..
          " err: " .. (err or "nil"))
  end

  return res
end

--- Sum the numeric entries of an array, ignoring entries of other types
-- @param t table array to sum; iteration stops at the first nil entry
-- @return number
local function sum(t)
  local s = 0
  for _, i in ipairs(t) do
    if type(i) == "number" then
      s = s + i
    end
  end

  return s
end

-- Note: could also use custom lua code in wrk
local nan = 0/0

--- Convert a latency captured from wrk output to milliseconds
-- @param value string the captured number; nil yields nan
-- @param unit string the captured unit prefix: "u" (us), "m" (ms) or
-- anything else for plain seconds
-- @return number latency in milliseconds
local function to_ms(value, unit)
  local factor = 1000
  if unit == "u" then
    factor = 0.001
  elseif unit == "m" then
    factor = 1
  end

  return tonumber(value or nan) * factor
end

--- Extract the headline numbers from the text report printed by wrk
-- @param r string raw wrk output
-- @return rps, request count, average latency, max latency, p90, p99.
-- Latencies are in milliseconds and are nan when not found in the output;
-- rps and request count are nil when not found.
local function parse_wrk_result(r)
  local rps = string.match(r, "Requests/sec:%s+([%d%.]+)")
  rps = tonumber(rps)
  local count = string.match(r, "([%d]+)%s+requests in")
  count = tonumber(count)

  -- the "Latency" row has three columns; the second one is matched but not captured
  local lat_avg, avg_m, lat_max, max_m = string.match(r, "Latency%s+([%d%.]+)([mu]?)s%s+[%d%.]+[mu]?s%s+([%d%.]+)([mu]?)s")
  lat_avg = to_ms(lat_avg, avg_m)
  lat_max = to_ms(lat_max, max_m)

  local p90, p90_m = string.match(r, "90%%%s+([%d%.]+)([mu]?)s")
  local p99, p99_m = string.match(r, "99%%%s+([%d%.]+)([mu]?)s")
  p90 = to_ms(p90, p90_m)
  p99 = to_ms(p99, p99_m)

  return rps, count, lat_avg, lat_max, p90, p99
end

--- Compute average of RPS and latency from multiple wrk output
-- @param results table the table holds raw wrk outputs
-- @param suite string xaxis suite name
-- @return string.
-- The human readable result of average RPS and latency
function _M.combine_results(results, suite)
  local result_count = #results
  if result_count == 0 then
    return "(no results)"
  end

  local rpss = table.new(result_count, 0)
  local latencies_avg = table.new(result_count, 0)
  local latencies_max = table.new(result_count, 0)
  local latencies_p90 = table.new(result_count, 0)
  local latencies_p99 = table.new(result_count, 0)
  -- total number of requests over all runs; weights the average latency
  local total_requests = 0

  for i, result in ipairs(results) do
    local r, c, la, lm, p90, p99 = parse_wrk_result(result)
    rpss[i] = r
    total_requests = total_requests + c
    -- store latency * requests so the sum can be divided by total_requests
    latencies_avg[i] = la * c
    latencies_max[i] = lm
    latencies_p90[i] = p90
    latencies_p99[i] = p99
  end

  -- average over the number of runs actually supplied, not a fixed count
  local rps = sum(rpss) / result_count
  local latency_avg = sum(latencies_avg) / total_requests
  local latency_max = math.max(unpack(latencies_max))

  if LAST_KONG_VERSION then
    charts.ingest_combined_results(LAST_KONG_VERSION, {
      rpss = rpss,
      rps = rps,
      latencies_p90 = latencies_p90,
      latencies_p99 = latencies_p99,
      latency_max = latency_max,
      latency_avg = latency_avg,
    }, suite)
  end

  -- NOTE(review): the report template is reproduced as found; its original
  -- line breaks may have been lost. Confirm the intended layout.
  return ([[ RPS Avg: %3.2f Latency Avg: %3.2fms Max: %3.2fms P90 (ms): %s P99 (ms): %s ]]):format(
    rps, latency_avg, latency_max,
    table.concat(latencies_p90, ", "), table.concat(latencies_p99, ", "))
end

--- Wait until the systemtap probe is loaded
-- @function wait_stap_probe
-- @param timeout[optional] number forwarded to the driver; default to 20
-- @return the value returned by the driver
function _M.wait_stap_probe(timeout)
  return invoke_driver("wait_stap_probe", timeout or 20)
end

--- Generate the flamegraph and write it to a SVG file
-- @function generate_flamegraph
-- @param filename string the file to write; ".svg" is appended if missing
-- @param title the title for flamegraph; default to "Flame graph"
-- @param opts the command line options string(not table) for flamegraph.pl
-- @return Nothing. Throws an error if any.
function _M.generate_flamegraph(filename, title, opts)
  if not filename then
    error("filename must be specified for generate_flamegraph")
  end
  -- case-insensitive check of the last four characters
  if filename:sub(-4):lower() ~= ".svg" then
    filename = filename .. ".svg"
  end

  if not title then
    title = "Flame graph"
  end

  -- If current test is git-based, also attach the Kong binary package
  -- version it based on
  if git.is_git_repo() and git.is_git_based() then
    -- use driver to get the version; driver could implement version override
    -- based on setups (like using the daily image)
    local v = invoke_driver("get_based_version")
    title = title .. " (based on " .. v .. ")"
  end

  local out = invoke_driver("generate_flamegraph", title, opts)

  local f, err = io.open(filename, "w")
  if not f then
    error("failed to open " .. filename .. " for writing flamegraph: " .. err)
  end

  f:write(out)
  f:close()

  my_logger.debug("flamegraph written to ", filename)
end

--- Enable or disable charts generation
-- @function enable_charts
-- @param enabled enable or not
-- @return Nothing. Throws an error if any.
function _M.enable_charts(enabled)
  -- explicit branches: an `enabled and charts.on() or charts.off()` expression
  -- would also call charts.off() whenever charts.on() returns nil or false
  if enabled then
    return charts.on()
  end

  return charts.off()
end

--- Save Kong error log locally
-- @function save_error_log
-- @param filename string the file the error log is written to
-- @return Nothing. Throws an error if any.
function _M.save_error_log(filename)
  if not filename then
    error("filename must be specified for save_error_log")
  end

  invoke_driver("save_error_log", filename)

  my_logger.debug("Kong error log written to ", filename)
end

--- Get the Admin URI accessible from worker
-- @function get_admin_uri
-- @param kong_name[optional] string specify the kong name; will automatically pick one if not specified
-- @return the Admin URI returned by the driver. Throws an error if any.
function _M.get_admin_uri(kong_name)
  return invoke_driver("get_admin_uri", kong_name)
end

--- Save a .sql file of the database
-- @function save_pgdump
-- @param path string the .sql file path
-- @return Nothing. Throws an error if any.
function _M.save_pgdump(path)
  return invoke_driver("save_pgdump", path)
end

--- Restore the database from a .sql file
-- @function load_pgdump
-- @param path string location of the .sql file
-- @param dont_patch_service bool when true, services are not updated to
-- point at the upstream started by this framework
-- @return Nothing. Throws an error if any.
function _M.load_pgdump(path, dont_patch_service)
  return invoke_driver("load_pgdump", path, dont_patch_service)
end

--- Run commands on a remote instance
-- @function remote_execute
-- @param node_type string which node runs the commands: "kong", "db" or "worker"
-- @param cmds table array of commands
-- @param continue_on_error bool when true, keep going after a command fails
-- @return the value returned by the driver
function _M.remote_execute(node_type, cmds, continue_on_error)
  return invoke_driver("remote_execute", node_type, cmds, continue_on_error)
end

return _M