spotify_confidence/analysis/frequentist/experiment.py (243 lines of code) (raw):

# Copyright 2017-2020 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union, Iterable, Tuple, Dict, List

from pandas import DataFrame

from spotify_confidence.analysis.frequentist.confidence_computers.confidence_computer import ConfidenceComputer
from .chartify_grapher import ChartifyGrapher
from ..abstract_base_classes.confidence_abc import ConfidenceABC
from ..abstract_base_classes.confidence_computer_abc import ConfidenceComputerABC
from ..abstract_base_classes.confidence_grapher_abc import ConfidenceGrapherABC
from ..confidence_utils import (
    validate_categorical_columns,
    listify,
    get_all_categorical_group_columns,
    get_all_group_columns,
)
from ..constants import BONFERRONI, NIM_TYPE, METHODS
from ..frequentist.sample_ratio_test import sample_ratio_test
from ...chartgrid import ChartGrid


class Experiment(ConfidenceABC):
    """
    This class represents an experiment which might include several metrics and treatment groups, as well as other
    dimensions to break down the results by. It provides several methods to analyze and visualize the results of the
    experiment. The experiment is based on data provided as a DataFrame with sufficient statistics, such as
    information the numerator, and denominator (number of units in the experiment) of the tested metrics and their
    grouping columns.

    Attributes:
        data_frame (DataFrame): DataFrame that contains the experimental data.
        numerator_column (str): Name of the DataFrame column that contains the numerator of the tested metric.
        numerator_sum_squares_column (Union[str, None]): Name of the DataFrame column that contains the sum of
            squares of the numerator.
        denominator_column (str): Name of the DataFrame column that contains the denominator of the tested metric.
        categorical_group_columns (Union[str, Iterable]): Column(s) that categorically group the data.
        ordinal_group_column (Union[str, None]): Column that ordinally group the data. Values need to be of types
            int or datetime
        interval_size (float): Size of the confidence interval. Defaults to 0.95.
        correction_method (str): Method for correction. Defaults to "bonferroni".
        confidence_computer (ConfidenceComputerABC): ConfidenceComputerABC object to compute confidence intervals.
        confidence_grapher (ConfidenceGrapherABC): ConfidenceGrapherABC object to plot confidence intervals.
        method_column (str): Column that contains the statistical test method, e.g. "z-test", "t-test".
        bootstrap_samples_column (str): Column that contains the bootstrap samples when method is "bootstrap".
        metric_column (str): Column that contain the names of metrics.
        treatment_column (str): Column that contains the names of treatment and control groups.
        power (float): Desired statistical power. Defaults to 0.8.
        feature_column (str): Column that contains the features when method is "z-test-linreg".
        feature_sum_squares_column (str): Column that contains the sum of squares of the features method is
            "z-test-linreg".
        feature_cross_sum_column (str): Column that contains the cross product sum of the features method is
            "z-test-linreg".
    """

    def __init__(
        self,
        data_frame: DataFrame,
        numerator_column: str,
        numerator_sum_squares_column: Union[str, None],
        denominator_column: str,
        categorical_group_columns: Union[str, Iterable],
        ordinal_group_column: Union[str, None] = None,
        interval_size: float = 0.95,
        correction_method: str = BONFERRONI,
        confidence_computer: ConfidenceComputerABC = None,
        confidence_grapher: ConfidenceGrapherABC = None,
        method_column: str = None,
        bootstrap_samples_column: str = None,
        # NOTE(review): annotated as str for consistency with the sibling
        # column parameters; previously these two were unannotated.
        metric_column: str = None,
        treatment_column: str = None,
        power: float = 0.8,
        feature_column: str = None,
        feature_sum_squares_column: str = None,
        feature_cross_sum_column: str = None,
    ):
        validate_categorical_columns(categorical_group_columns)
        self._df = data_frame
        self._numerator = numerator_column
        self._numerator_sumsq = numerator_sum_squares_column
        self._denominator = denominator_column
        # metric and treatment columns are treated as additional categorical
        # grouping dimensions.
        self._categorical_group_columns = get_all_categorical_group_columns(
            categorical_group_columns, metric_column, treatment_column
        )
        self._ordinal_group_column = ordinal_group_column
        self._metric_column = metric_column
        self._treatment_column = treatment_column
        self._all_group_columns = get_all_group_columns(self._categorical_group_columns, self._ordinal_group_column)
        if method_column is None:
            raise ValueError("method column cannot be None")
        # Vectorized membership check instead of a Python-level lambda scan.
        if not self._df[method_column].isin(METHODS).all():
            raise ValueError(f"Values of method column must be in {METHODS}")

        if confidence_computer is not None:
            self._confidence_computer = confidence_computer
        else:
            self._confidence_computer = ConfidenceComputer(
                data_frame=data_frame,
                numerator_column=numerator_column,
                numerator_sum_squares_column=numerator_sum_squares_column,
                denominator_column=denominator_column,
                categorical_group_columns=listify(categorical_group_columns),
                ordinal_group_column=ordinal_group_column,
                interval_size=interval_size,
                # correction method names are matched case-insensitively
                correction_method=correction_method.lower(),
                method_column=method_column,
                bootstrap_samples_column=bootstrap_samples_column,
                metric_column=metric_column,
                treatment_column=treatment_column,
                power=power,
                feature_column=feature_column,
                feature_sum_squares_column=feature_sum_squares_column,
                feature_cross_sum_column=feature_cross_sum_column,
            )

        self._confidence_grapher = (
            confidence_grapher
            if confidence_grapher is not None
            else ChartifyGrapher(
                data_frame=self._df,
                numerator_column=self._numerator,
                denominator_column=self._denominator,
                categorical_group_columns=self._categorical_group_columns,
                ordinal_group_column=self._ordinal_group_column,
            )
        )

    def summary(self, verbose: bool = False) -> DataFrame:
        """Return point estimates and confidence intervals for all groups."""
        return self._confidence_computer.compute_summary(verbose)

    def difference(
        self,
        level_1: Union[str, Tuple],
        level_2: Union[str, Tuple],
        absolute: bool = True,
        groupby: Union[str, Iterable] = None,
        non_inferiority_margins: NIM_TYPE = None,
        final_expected_sample_size_column: str = None,
        verbose: bool = False,
        minimum_detectable_effects_column: str = None,
    ) -> DataFrame:
        """Compute the difference between ``level_1`` and ``level_2``,
        optionally broken down by ``groupby``."""
        self._validate_sequential(final_expected_sample_size_column, groupby)
        return self._confidence_computer.compute_difference(
            level_1,
            level_2,
            absolute,
            groupby,
            non_inferiority_margins,
            final_expected_sample_size_column,
            verbose,
            minimum_detectable_effects_column,
        )

    def differences(
        self,
        levels: Union[Tuple, List[Tuple]],
        absolute: bool = True,
        groupby: Union[str, Iterable] = None,
        non_inferiority_margins: NIM_TYPE = None,
        final_expected_sample_size_column: str = None,
        verbose: bool = False,
        minimum_detectable_effects_column: str = None,
    ) -> DataFrame:
        """Compute differences for several pairs of levels at once."""
        self._validate_sequential(final_expected_sample_size_column, groupby)
        return self._confidence_computer.compute_differences(
            levels,
            absolute,
            groupby,
            non_inferiority_margins,
            final_expected_sample_size_column,
            verbose,
            minimum_detectable_effects_column,
        )

    def multiple_difference(
        self,
        level: Union[str, Tuple],
        absolute: bool = True,
        groupby: Union[str, Iterable] = None,
        level_as_reference: bool = None,
        non_inferiority_margins: NIM_TYPE = None,
        final_expected_sample_size_column: str = None,
        verbose: bool = False,
        minimum_detectable_effects_column: str = None,
    ) -> DataFrame:
        """Compute differences between ``level`` and all other levels."""
        self._validate_sequential(final_expected_sample_size_column, groupby)
        return self._confidence_computer.compute_multiple_difference(
            level,
            absolute,
            groupby,
            level_as_reference,
            non_inferiority_margins,
            final_expected_sample_size_column,
            verbose,
            minimum_detectable_effects_column,
        )

    def summary_plot(self, groupby: Union[str, Iterable] = None) -> ChartGrid:
        """Plot the summary (point estimates with confidence intervals)."""
        summary_df = self.summary()
        graph = self._confidence_grapher.plot_summary(summary_df, groupby)
        return graph

    def difference_plot(
        self,
        level_1: Union[str, Tuple],
        level_2: Union[str, Tuple],
        absolute: bool = True,
        groupby: Union[str, Iterable] = None,
        non_inferiority_margins: NIM_TYPE = None,
        use_adjusted_intervals: bool = False,
        final_expected_sample_size_column: str = None,
        split_plot_by_groups: bool = False,
    ) -> ChartGrid:
        """Plot the result of :meth:`difference` for the two given levels."""
        difference_df = self.difference(
            level_1=level_1,
            level_2=level_2,
            absolute=absolute,
            groupby=groupby,
            non_inferiority_margins=non_inferiority_margins,
            final_expected_sample_size_column=final_expected_sample_size_column,
            verbose=True,
        )
        chartgrid = self._confidence_grapher.plot_difference(
            difference_df, absolute, groupby, non_inferiority_margins, use_adjusted_intervals, split_plot_by_groups
        )
        return chartgrid

    def differences_plot(
        self,
        levels: List[Tuple],
        absolute: bool = True,
        groupby: Union[str, Iterable] = None,
        non_inferiority_margins: NIM_TYPE = None,
        use_adjusted_intervals: bool = False,
        final_expected_sample_size_column: str = None,
        split_plot_by_groups: bool = False,
    ) -> ChartGrid:
        """Plot the result of :meth:`differences` for the given level pairs."""
        difference_df = self.differences(
            levels, absolute, groupby, non_inferiority_margins, final_expected_sample_size_column, verbose=True
        )
        chartgrid = self._confidence_grapher.plot_differences(
            difference_df, absolute, groupby, non_inferiority_margins, use_adjusted_intervals, split_plot_by_groups
        )
        return chartgrid

    def multiple_difference_plot(
        self,
        level: Union[str, Tuple],
        absolute: bool = True,
        groupby: Union[str, Iterable] = None,
        level_as_reference: bool = None,
        non_inferiority_margins: NIM_TYPE = None,
        use_adjusted_intervals: bool = False,
        final_expected_sample_size_column: str = None,
        split_plot_by_groups: bool = False,
    ) -> ChartGrid:
        """Plot the result of :meth:`multiple_difference` for ``level``."""
        difference_df = self.multiple_difference(
            level=level,
            absolute=absolute,
            groupby=groupby,
            level_as_reference=level_as_reference,
            non_inferiority_margins=non_inferiority_margins,
            final_expected_sample_size_column=final_expected_sample_size_column,
            verbose=True,
        )
        chartgrid = self._confidence_grapher.plot_multiple_difference(
            difference_df,
            absolute,
            groupby,
            level_as_reference,
            non_inferiority_margins,
            use_adjusted_intervals,
            split_plot_by_groups,
        )
        return chartgrid

    def sample_ratio_test(self, expected_proportions: Dict) -> Tuple[float, DataFrame]:
        """Run a sample-ratio-mismatch test against ``expected_proportions``."""
        return sample_ratio_test(
            self._df,
            all_group_columns=self._all_group_columns,
            denominator=self._denominator,
            expected_proportions=expected_proportions,
        )

    def achieved_power(self, level_1, level_2, mde, alpha, groupby=None) -> DataFrame:
        """Calculated the achieved power of test of differences between
        level 1 and level 2 given a targeted MDE.

        Args:
            level_1 (str, tuple of str): Name of first level.
            level_2 (str, tuple of str): Name of second level.
            mde (float): Absolute minimal detectable effect size.
            alpha (float): Type I error rate, cutoff value for determining
                statistical significance.
            groupby (str): Name of column.
                If specified, will return the difference for each level
                of the grouped dimension.

        Returns:
            Pandas DataFrame with the following columns:
            - level_1: Name of level 1.
            - level_2: Name of level 2.
            - power: 1 - B, where B is the likelihood of a Type II (false
              negative) error.
        """
        return self._confidence_computer.achieved_power(level_1, level_2, mde, alpha, groupby)

    def _validate_sequential(self, final_expected_sample_size: float, groupby: Union[str, Iterable]):
        """Raise if sequential testing is requested without grouping by the
        ordinal column (sequential tests are evaluated along it)."""
        if final_expected_sample_size is not None:
            if self._ordinal_group_column not in listify(groupby):
                raise ValueError(
                    f"{self._ordinal_group_column} must be in groupby argument to use "
                    f"sequential testing with final_expected_sample_size"
                )