spotify_confidence/examples.py (93 lines of code) (raw):

# Copyright 2017-2020 Spotify AB # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import pandas as pd import numpy as np from itertools import product def example_data_binomial(): """ Returns an output dataframe with categorical features (country and test variation), and orginal features (date), as well as number of successes and total observations for each combination """ countries = ["ca", "us"] dates = pd.date_range("2018-01-01", "2018-02-01") variation_names = ["test", "control", "test2"] # test ca, test us, control ca, control us, test2 ca, test2 us success_rates = [0.3, 0.32, 0.24, 0.22, 0.25, 0.42] n_observations = [50, 80, 30, 50, 40, 50] return_df = pd.DataFrame() for i, (country, variation) in enumerate(product(countries, variation_names)): df = pd.DataFrame({"date": dates}) df["country"] = country df["variation_name"] = variation df["total"] = np.random.poisson(n_observations[i], size=len(dates)) df["success"] = df["total"].apply(lambda x: np.random.binomial(x, success_rates[i])) return_df = pd.concat([return_df, df], axis=0) return return_df def example_data_gaussian(): df = pd.DataFrame( { "variation_name": [ "test", "control", "test2", "test", "control", "test2", "test", "control", "test2", "test", "control", "test2", "test", "control", "test2", ], "nr_of_items": [ 500, 8, 100, 510, 8, 100, 520, 9, 104, 530, 7, 100, 530, 8, 103, ], "nr_of_items_sumsq": [ 2500, 12, 150, 2510, 13, 140, 2520, 14, 154, 2530, 15, 160, 2530, 16, 103, ], "users": [ 1010, 22, 150, 1000, 20, 153, 1030, 23, 154, 1000, 20, 150, 1040, 21, 155, ], "days_since_reg": [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5], } ) return df