spotify_confidence/analysis/bayesian/bayesian_models.py (395 lines of code) (raw):

# Copyright 2017-2020 Spotify AB # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from collections import OrderedDict import chartify import numpy as np import pandas as pd from scipy.stats import beta from spotify_confidence.analysis.bayesian.bayesian_base import ( BaseTest, randomization_warning_decorator, format_str_precision, axis_format_precision, ) from spotify_confidence.analysis.confidence_utils import de_list_if_length_one class BetaBinomial(BaseTest): def __init__( self, data_frame, numerator_column, denominator_column, categorical_group_columns=None, ordinal_group_column=None, prior_alpha_column=None, prior_beta_column=None, interval_size=0.95, ): """ Bayesian BetaBinomial model. See: https://en.wikipedia.org/wiki/Beta-binomial_distribution data_frame (pd.DataFrame): DataFrame numerator_column (str): Column name for numerator column. denominator_column (str): Column name for denominator column. categorical_group_columns (str or list): Column names for categorical groupings. ordinal_group_column (str): Column name for ordinal grouping (e.g. numeric or date values). prior_alpha_column (str): Column name to use for prior alpha. prior_beta_column (str): Column name to use for prior beta. interval_size (float): Size of credible intervals. Default 0.95 """ super().__init__( data_frame, categorical_group_columns, ordinal_group_column, numerator_column, denominator_column, interval_size, ) self._monte_carlo_sample_size = 500000 # Initialize priors. if prior_alpha_column is None or prior_beta_column is None: self._alpha_prior, self._beta_prior = (0.5, 0.5) else: self._alpha_prior = data_frame[prior_alpha_column] self._beta_prior = data_frame[prior_beta_column] def _interval(self, row): interval = beta.interval( self._interval_size, row[self._numerator_column] + self._alpha_prior, row[self._denominator_column] - row[self._numerator_column] + self._beta_prior, ) return interval def _posterior_parameters(self, group_df): """Calculate parameters of posterior distribution. Returns: tuple of floats: posterior_alpha, posterior_beta""" numerator = group_df[self._numerator_column].values[0] denominator = group_df[self._denominator_column].values[0] posterior_alpha = numerator + self._alpha_prior posterior_beta = denominator - numerator + self._beta_prior return posterior_alpha, posterior_beta def _beta_pdf(self, group_df): """Beta pdfs for the given dataframe""" posterior_alpha, posterior_beta = self._posterior_parameters(group_df) epsilon = 0.001 lower_range = beta.isf(1.0 - epsilon, posterior_alpha, posterior_beta) upper_range = beta.isf(epsilon, posterior_alpha, posterior_beta) x_range = np.linspace(lower_range, upper_range, 1000) beta_pdf = [beta.pdf(x, posterior_alpha, posterior_beta) for x in x_range] beta_dist = pd.DataFrame({"x": x_range, "y": beta_pdf}) return beta_dist def _sample_posterior(self, group_df, posterior_sample_size=None): """MCMC sampling of posterior distribution. Used to calculate the posterior distribution of the difference in Beta RVs. Arguments: - seed (int): Seed for random number generator. Set it to make the posteriors deterministic. - posterior_sample_size (int): Number of posterior samples (affects precision) """ if posterior_sample_size is None: posterior_sample_size = self._monte_carlo_sample_size posterior_alpha, posterior_beta = self._posterior_parameters(group_df) posterior_samples = np.random.beta(posterior_alpha, posterior_beta, size=posterior_sample_size) return posterior_samples def _categorical_summary_plot(self, level_name, level_df, remaining_groups, groupby): if not remaining_groups: remaining_groups = groupby grouped_df = level_df.groupby(de_list_if_length_one(remaining_groups)) distributions = pd.DataFrame() for group_name, group_df in grouped_df: beta_dist = self._beta_pdf(group_df) beta_dist["group"] = str(group_name) distributions = pd.concat([distributions, beta_dist], axis=0) # Filter out the long tails of the distributions filtered_xs = distributions.groupby("x")["y"].max().reset_index().loc[lambda x: x["y"] > 0.01] distributions = distributions[distributions["x"].isin(filtered_xs["x"])] # Remove legend if only one color color_column = "group" if len(grouped_df) > 1 else None ch = chartify.Chart() ch.plot.area( distributions, "x", "y", color_column=color_column, stacked=False, color_order=[str(x) for x in list(grouped_df.groups.keys())], ) ch.set_title("Estimate of {} / {}".format(self._numerator_column, self._denominator_column)) if groupby: ch.set_subtitle("{}: {}".format(groupby, level_name)) else: ch.set_subtitle("") ch.axes.set_xaxis_label("{} / {}".format(self._numerator_column, self._denominator_column)) ch.axes.set_yaxis_label("Probability Density") ch.set_source_label("") ch.axes.set_yaxis_range(0) axis_format = axis_format_precision(distributions["x"].min(), distributions["x"].max(), absolute=True) ch.axes.set_xaxis_tick_format(axis_format) ch.style.color_palette.reset_palette_order() # Plot callouts for the means for group_name, group_df in grouped_df: posterior_alpha, posterior_beta = self._posterior_parameters(group_df) posterior_mean = posterior_alpha / (posterior_alpha + posterior_beta) density = beta.pdf(posterior_mean, posterior_alpha, posterior_beta) ch.callout.line( posterior_mean, orientation="height", line_color=ch.style.color_palette.next_color(), line_dash="dashed", ) ch.callout.text( f"{posterior_mean:{format_str_precision(posterior_mean, absolute=False)}}", posterior_mean, density ) ch.axes.hide_yaxis() if color_column: ch.set_legend_location("outside_bottom") return ch def _difference_posteriors(self, data, level_1, level_2, absolute=True): posterior_1 = self._sample_posterior(data.get_group(level_1)) posterior_2 = self._sample_posterior(data.get_group(level_2)) if absolute: difference_posterior = posterior_2 - posterior_1 else: difference_posterior = posterior_2 / posterior_1 - 1.0 return difference_posterior def _differences(self, difference_posterior, level_1, level_2, absolute): # 95% credible interval for posterior credible_interval = ( pd.Series(difference_posterior).quantile((1.0 - self._interval_size) / 2), pd.Series(difference_posterior).quantile((1.0 - self._interval_size) / 2 + self._interval_size), ) # Probability that posterior is greater # than zero (count occurences in the MC sample) p_gt_zero = (difference_posterior > 0).mean() expected_loss_v2 = difference_posterior[difference_posterior < 0].sum() / len(difference_posterior) if (difference_posterior > 0).sum() == 0: expected_gain_v2 = 0 else: expected_gain_v2 = difference_posterior[difference_posterior > 0].sum() / len(difference_posterior) expected_loss_v1 = (difference_posterior[difference_posterior * -1.0 < 0] * -1.0).sum() / len( difference_posterior ) if (difference_posterior * -1.0 > 0).sum() == 0: expected_gain_v1 = 0 else: expected_gain_v1 = (difference_posterior[difference_posterior * -1.0 > 0] * -1.0).sum() / len( difference_posterior ) return pd.DataFrame( OrderedDict( [ ("level_1", str(level_1)), ("level_2", str(level_2)), ("absolute_difference", absolute), ("difference", difference_posterior.mean()), ("ci_lower", [credible_interval[0]]), ("ci_upper", [credible_interval[1]]), ("P(level_2 > level_1)", p_gt_zero), ("level_1 potential loss", expected_loss_v1), ("level_1 potential gain", expected_gain_v1), ("level_2 potential loss", expected_loss_v2), ("level_2 potential gain", expected_gain_v2), ] ) ) def _difference(self, level_name, level_df, remaining_groups, groupby, level_1, level_2, absolute): difference_df, _ = self._difference_and_difference_posterior( level_df, remaining_groups, level_2, level_1, absolute ) self._add_group_by_columns(difference_df, groupby, level_name) return difference_df def _difference_and_difference_posterior(self, level_df, remaining_groups, level_2, level_1, absolute): self._validate_levels(level_df, remaining_groups, level_1) self._validate_levels(level_df, remaining_groups, level_2) # difference is posterior_2 - posterior_1 difference_posterior = self._difference_posteriors( level_df.groupby(remaining_groups), level_1, level_2, absolute ) difference_df = self._differences(difference_posterior, level_1, level_2, absolute) return difference_df, difference_posterior @randomization_warning_decorator def difference(self, level_1, level_2, absolute=True, groupby=None): """Return DataFrame with summary statistics of the difference between level 1 and level 2. Args: level_1 (str, tuple of str): Name of first level. level_2 (str, tuple of str): Name of second level. absolute (bool): If True then return the absolute difference (level2 - level1) otherwise return the relative difference (level2 / level1 - 1) groupby (str): Name of column. If specified, will return the difference for each level of the grouped dimension. Returns: Pandas DataFrame with the following columns: - level_1: Name of level 1. - level_2: Name of level 2. - absolute_difference: True if absolute. Absolute: level2 - level1 Relative: level2 / level1 - 1 - difference: Best estimate of the difference between level 2 and 1. Posterior mean of the difference between level 1 and level 2. https://en.wikipedia.org/wiki/Bayes_estimator - ci_lower: Lower credible interval bound of the difference. - ci_upper: Upper credible interval bound of the difference. - P(level_2 > level_1): Probability that the level 2 > level 1. - level_1 potential loss: The expected loss if we switch to level 1, but level 2 is actually better. - level_1 potential gain: The expected gain if we switch to level 1, and level 1 is actually better. - level_2 potential loss: The expected loss if we switch to level 2, but level 1 is actually better. - level_2 potential gain: The expected gain if we switch to level 2, and level 2 is actually better. """ results_df = self._iterate_groupby_to_dataframe( self._difference, groupby=groupby, level_1=level_1, level_2=level_2, absolute=absolute ) return results_df @randomization_warning_decorator def _categorical_difference_plot(self, level_1, level_2, absolute, groupby): chart_grid = self._iterate_groupby_to_chartgrid( self._categorical_difference_plot_, groupby=groupby, level_1=level_1, level_2=level_2, absolute=absolute ) return chart_grid def _categorical_difference_plot_( self, level_name, level_df, remaining_groups, groupby, level_1, level_2, absolute ): difference_df, difference_posterior = self._difference_and_difference_posterior( level_df, remaining_groups, level_2, level_1, absolute ) posterior_mean = difference_df["difference"][0] # potential_loss = difference_df['{} potential loss'.format(level_2)][0] # Take the difference posterior and create a chart df = pd.DataFrame({"values": difference_posterior}) ch = chartify.Chart(y_axis_type="density", x_axis_type="linear") ch.plot.kde(df, "values") ch.set_title("Change from {} to {}".format(level_1, level_2)) subtitle = "" if not groupby else "{}: {}".format(groupby, level_name) ch.set_subtitle(subtitle) # Line at no difference ch.callout.line(0, orientation="height", line_color="black", line_dash="dashed") # ch.callout.text('No change', 0, .5, angle=90) # Plot callout for the mean ch.callout.line( posterior_mean, orientation="height", line_color=ch.style.color_palette._colors[0], line_dash="dashed" ) # ch.callout.text( # '{0:.2f}%'.format(posterior_mean * 100), posterior_mean, 0) ch.callout.text( f"Expected change: {posterior_mean:{format_str_precision(posterior_mean, absolute=False)}}", posterior_mean, 0, angle=90, ) # ch.callout.line( # potential_loss, # orientation='height', # line_color=ch.style.color_palette._colors[1]) # ch.callout.text( # 'Potential Loss: {0:.2f}%'.format(potential_loss * 100), # potential_loss, # 1.5, # angle=90) # ch.callout.text( # '{0:.2f}%'.format(potential_loss * 100), potential_loss, 1.) ch.set_source_label("") ch.axes.set_yaxis_range(0) ch.axes.set_xaxis_label(self.get_difference_plot_label(absolute)) ch.axes.set_yaxis_label("Probability Density") ch.axes.hide_yaxis() axis_format = axis_format_precision(df["values"].max() * 10, df["values"].min() * 10, absolute) ch.axes.set_xaxis_tick_format(axis_format) return ch def _multiple_difference_joint_dataframe(self, *args, **kwargs): return self._multiple_difference_joint_base(*args, **kwargs)[0] def _multiple_difference_joint_base(self, level_name, level_df, remaining_groups, groupby, level, absolute): grouped_df = level_df.groupby(remaining_groups) grouped_df_keys = tuple(grouped_df.groups.keys()) self._validate_levels(level_df, remaining_groups, level) posteriors = [self._sample_posterior(grouped_df.get_group(level)) for level in grouped_df_keys] var_indx = grouped_df_keys.index(level) other_indx = [i for i, value in enumerate(grouped_df_keys) if value != level] posterior_matrix = np.vstack(posteriors) ge_bool_matrix = posterior_matrix[var_indx, :] >= posterior_matrix[:, :] best_arr = ge_bool_matrix.all(axis=0) p_ge_all = best_arr.mean() end_value = posterior_matrix[var_indx] start_value = posterior_matrix[other_indx].max(axis=0) if absolute: difference_posterior = end_value - start_value else: difference_posterior = end_value / start_value - 1 # E(level - best level | level != best) if not (~best_arr).sum(): expected_loss = 0 else: expected_loss = difference_posterior[~best_arr].mean() # E(level - median level | level = best) if not (best_arr).sum(): expected_gain = 0 else: expected_gain = difference_posterior[best_arr].mean() expectation = difference_posterior.mean() ci_l_expectation = pd.Series(difference_posterior).quantile((1.0 - self._interval_size) / 2) ci_u_expectation = pd.Series(difference_posterior).quantile( (1.0 - self._interval_size) / 2 + self._interval_size ) difference_df = pd.DataFrame( OrderedDict( [ ("level", [str(level)]), ("absolute_difference", absolute), ("difference", expectation), ("ci_lower", ci_l_expectation), ("ci_upper", ci_u_expectation), ("P({} >= all)".format(level), p_ge_all), ("{} potential loss".format(level), expected_loss), ("{} potential gain".format(level), expected_gain), ] ) ) self._add_group_by_columns(difference_df, groupby, level_name) return (difference_df, difference_posterior) @randomization_warning_decorator def multiple_difference_joint(self, level, absolute=True, groupby=None): """Calculate the joint probability that the given level is greater than all other levels in the test. Args: level (str, tuple of str): Name of level. absolute (bool): If True then return the absolute difference otherwise return the relative difference. groupby (str): Name of column. If specified, will return an interval for each level of the grouped dimension. Returns: Pandas DataFrame with the following columns: - level: Name of level - absolute_difference: True if absolute. Absolute: level2 - level1 Relative: level2 / level1 - 1 - difference: Difference between the level and the best performing among the other levels. - ci_lower: Lower credible interval bound of the difference. - ci_upper: Upper credible interval bound of the difference. - P(level > all): Probability that the level > all other levels. - potential loss: The expected loss if we switch to level, but some other level is actually better. - potential gain: The expected gain if we switch to level, and it is actually the best. """ results_df = self._iterate_groupby_to_dataframe( self._multiple_difference_joint_dataframe, groupby=groupby, level=level, absolute=absolute ) return results_df def _multiple_difference_joint_plot(self, level_name, level_df, remaining_groups, groupby, level, absolute): self._validate_levels(level_df, remaining_groups, level) difference_df, difference_posterior = self._multiple_difference_joint_base( level_name, level_df, remaining_groups, groupby, level, absolute ) posterior_mean = difference_df.loc[:, "difference"].values[0] # potential_loss = difference_df.loc[:, '{} potential loss'.format( # level)].values[0] # Take the difference posterior and create a chart df = pd.DataFrame({"values": difference_posterior}) ch = chartify.Chart(y_axis_type="density", x_axis_type="linear") ch.plot.kde(df, "values") ch.set_title("Comparison to {}".format(level)) subtitle = "" if not groupby else "{}: {}".format(groupby, level_name) ch.set_subtitle(subtitle) # Line at no difference ch.callout.line(0, orientation="height", line_color="black") # Plot callout for the mean ch.callout.line(posterior_mean, orientation="height", line_color=ch.style.color_palette._colors[0]) ch.callout.text( f"Expected change: {posterior_mean:{format_str_precision(posterior_mean, absolute=False)}}", posterior_mean, 0, angle=90, ) ch.set_source_label("") ch.axes.set_yaxis_range(0) ch.axes.set_xaxis_label(self.get_difference_plot_label(absolute)) ch.axes.set_yaxis_label("Probability Density") ch.axes.hide_yaxis() axis_format = axis_format_precision(df["values"].max() * 10, df["values"].min() * 10, absolute) ch.axes.set_xaxis_tick_format(axis_format) return ch @randomization_warning_decorator def multiple_difference_joint_plot(self, level, absolute=True, groupby=None): """Calculate the joint probability that the given level is greater than all other levels in the test. Args: level (str, tuple of str): Name of level. absolute (bool): If True then return the absolute difference otherwise return the relative difference. groupby (str): Name of column. If specified, will return an interval for each level of the grouped dimension. Returns: GroupedChart object. """ results_df = self._iterate_groupby_to_chartgrid( self._multiple_difference_joint_plot, groupby=groupby, level=level, absolute=absolute ) return results_df def _multiple_difference( self, level_name, level_df, remaining_groups, groupby, level, absolute, level_as_reference ): grouped_df = level_df.groupby(remaining_groups) grouped_df_keys = tuple(grouped_df.groups.keys()) other_keys = [value for i, value in enumerate(grouped_df_keys) if value != level] for key in other_keys: # Switch the subtraction order as specified. start_value, end_value = level, key if not level_as_reference: start_value, end_value = end_value, start_value difference_df = self._difference( level_name, level_df, remaining_groups, groupby, start_value, end_value, absolute=absolute ) yield difference_df @randomization_warning_decorator def multiple_difference(self, level, absolute=True, groupby=None, level_as_reference=False): """Pairwise comparison of the given level to all others. Args: level (str, tuple of str): Name of level. absolute (bool): If True then return the absolute difference otherwise return the relative difference. groupby (str): Name of column. If specified, will return an interval for each level of the grouped dimension. level_as_reference (bool): If True, the given level is the reference value for the change. (level1) Returns: Pandas DataFrame with the following columns: - groupby (If groupby is not None): Grouped dimension - level_1: Name of level 1. - level_2: Name of level 2. - absolute_difference: True if absolute. Absolute: level2 - level1 Relative: level2 / level1 - 1 - difference: Best estimate of the difference between level 2 and 1. Posterior mean of the difference between level 1 and level 2. https://en.wikipedia.org/wiki/Bayes_estimator - ci_lower: Lower credible interval bound of the difference. - ci_upper: Upper credible interval bound of the difference. - P(level_2 > level_1): Probability that the level 2 > level 1. - level_1 potential loss: The expected loss if we switch to level 1, but level 2 is actually better. - level_1 potential gain: The expected gain if we switch to level 1, and level 1 is actually better. - level_2 potential loss: The expected loss if we switch to level 2, but level 1 is actually better. - level_2 potential gain: The expected gain if we switch to level 2, and level 2 is actually better. """ results_df = self._iterate_groupby_to_dataframe( self._multiple_difference, groupby=groupby, level=level, absolute=absolute, level_as_reference=level_as_reference, ) results_df = results_df.reset_index(drop=True) return results_df def _categorical_multiple_difference_chart( self, level_name, level_df, remaining_groups, groupby, level, absolute, level_as_reference ): grouped_df = level_df.groupby(remaining_groups) grouped_df_keys = tuple(grouped_df.groups.keys()) self._validate_levels(level_df, remaining_groups, level) posteriors = [self._sample_posterior(grouped_df.get_group(level)) for level in grouped_df_keys] var_indx = grouped_df_keys.index(level) other_indx = [i for i, value in enumerate(grouped_df_keys) if value != level] posterior_matrix = np.vstack(posteriors) start_value = posterior_matrix[var_indx] end_value = posterior_matrix if not level_as_reference: start_value, end_value = end_value, start_value if absolute: difference_posterior = end_value - start_value else: difference_posterior = end_value / start_value - 1 df = pd.DataFrame() for group in other_indx: df = pd.concat( [df, pd.DataFrame({"values": difference_posterior[group], "group": str(grouped_df_keys[group])})], axis=0, ) # Take the difference posterior and create a chart # df = pd.DataFrame({'values': difference_posterior}) ch = chartify.Chart(y_axis_type="density", x_axis_type="linear") ch.plot.kde(df, "values", color_column="group") title_change_label = "from" if level_as_reference else "to" ch.set_title("Change {} {}".format(title_change_label, level)) subtitle = "" if not groupby else "{}: {}".format(groupby, level_name) ch.set_subtitle(subtitle) # Line at no difference ch.callout.line(0, orientation="height", line_color="black", line_dash="dashed") # ch.callout.text('No change', 0, .5, angle=90) ch.style.color_palette.reset_palette_order() for group in other_indx: posterior_mean = difference_posterior[group].mean() # Plot callout for the mean ch.callout.line( posterior_mean, orientation="height", line_color=ch.style.color_palette.next_color(), line_dash="dashed", ) ch.callout.text( f"Expected change: {posterior_mean:{format_str_precision(posterior_mean, absolute=False)}}", posterior_mean, 0, angle=90, ) # ch.callout.line( # potential_loss, # orientation='height', # line_color=ch.style.color_palette._colors[1]) # ch.callout.text( # 'Potential Loss: {0:.2f}%'.format(potential_loss * 100), # potential_loss, # 1.5, # angle=90) # ch.callout.text( # '{0:.2f}%'.format(potential_loss * 100), potential_loss, 1.) ch.set_source_label("") ch.axes.set_yaxis_range(0) ch.axes.set_xaxis_label(self.get_difference_plot_label(absolute)) ch.axes.set_yaxis_label("Probability Density") ch.axes.hide_yaxis() axis_format = axis_format_precision(df["values"].max() * 10, df["values"].min() * 10, absolute) ch.axes.set_xaxis_tick_format(axis_format) return ch @randomization_warning_decorator def _categorical_multiple_difference_plot(self, level, absolute, groupby, level_as_reference): """Pairwise comparison of the given level to all others. Args: level (str, tuple of str): Name of level. absolute (bool): If True then return the absolute difference otherwise return the relative difference. groupby (str): Name of column. If specified, will return an interval for each level of the grouped dimension. level_as_reference (bool): If True, the given level is the reference value for the change. (level1) Returns: GroupedChart object. """ results_df = self._iterate_groupby_to_chartgrid( self._categorical_multiple_difference_chart, groupby=groupby, level=level, absolute=absolute, level_as_reference=level_as_reference, ) return results_df # class GammaPoisson(PoissonResponse): # pass # class DirichetMultinomial(MultinomialResponse): # def __init__(self, # data_frame, # group_columns, # category_column, # value_column, # prior_value_column=None): # super().__init__(data_frame, group_columns, category_column, # value_column) # class Gaussian(GaussianResponse): # def __init__(self, # data_frame, # groupings, # mean_col, # std_col, # n_col, # time_grouping=None, # prior_columns=None): # self.prior_lambda_column = prior_lambda_column # super(BaseGaussianResponse, self).__init__( # data_frame, groups, mean_col, std_col, n_col, time_grouping) # raise (NotImplementedError) # class DirichetCategorical(CategoricalResponse): # pass