spotify_confidence/analysis/frequentist/chartify_grapher.py (579 lines of code) (raw):

# Copyright 2017-2020 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Union, Iterable, Tuple

import numpy as np
from bokeh.models import tools
from chartify import Chart
from pandas import DataFrame, concat

from ..abstract_base_classes.confidence_grapher_abc import ConfidenceGrapherABC
from ..confidence_utils import (
    axis_format_precision,
    add_color_column,
    get_remaning_groups,
    get_all_group_columns,
    listify,
    level2str,
    to_finite,
    de_list_if_length_one,
)
from ..constants import (
    POINT_ESTIMATE,
    ORIGINAL_POINT_ESTIMATE,
    DIFFERENCE,
    CI_LOWER,
    CI_UPPER,
    P_VALUE,
    ADJUSTED_LOWER,
    ADJUSTED_UPPER,
    ADJUSTED_P,
    NULL_HYPOTHESIS,
    NIM,
    NIM_TYPE,
    PREFERENCE,
    SFX1,
)
from ...chartgrid import ChartGrid


class ChartifyGrapher(ConfidenceGrapherABC):
    """Draw summary and difference data frames as chartify charts.

    Every public ``plot_*`` method returns a ``ChartGrid``. Frames that are
    grouped on the ordinal column are drawn as line charts with a shaded
    interval; all other frames are drawn as categorical interval charts.
    """

    def __init__(
        self,
        data_frame: DataFrame,
        numerator_column: str,
        denominator_column: str,
        categorical_group_columns: str,
        ordinal_group_column: str,
    ):
        """Store the source frame and the column names used for labels and grouping."""
        self._df = data_frame
        self._numerator = numerator_column
        self._denominator = denominator_column
        self._categorical_group_columns = categorical_group_columns
        self._ordinal_group_column = ordinal_group_column
        self._all_group_columns = get_all_group_columns(self._categorical_group_columns, self._ordinal_group_column)

    # ------------------------------------------------------------------
    # Public entry points
    # ------------------------------------------------------------------
    def plot_summary(self, summary_df: DataFrame, groupby: Union[str, Iterable]) -> ChartGrid:
        """Plot point estimates with intervals, one chart per ``groupby`` level.

        When ``groupby`` is None a single chart is produced from the whole
        frame; otherwise ``summary_df`` is split on ``groupby`` and one chart
        is appended per group.
        """
        ch = ChartGrid()
        if groupby is None:
            ch.charts.append(self._summary_plot(level_name=None, level_df=summary_df, groupby=groupby))
        else:
            for level_name, level_df in summary_df.groupby(de_list_if_length_one(groupby)):
                ch.charts.append(self._summary_plot(level_name=level_name, level_df=level_df, groupby=groupby))
        return ch

    def plot_difference(
        self,
        difference_df,
        absolute,
        groupby,
        nims: NIM_TYPE,
        use_adjusted_intervals: bool,
        split_plot_by_groups: bool,
    ) -> ChartGrid:
        """Plot a single level_1-vs-level_2 difference.

        One chart is drawn for the whole frame unless ``split_plot_by_groups``
        is true and ``groupby`` contains columns other than the ordinal one,
        in which case one chart is drawn per combination of those columns.
        ``nims`` is accepted for interface compatibility and is not read here.
        """
        ch = ChartGrid()
        categorical_groups = get_remaning_groups(listify(groupby), self._ordinal_group_column)
        if len(categorical_groups) == 0 or not split_plot_by_groups:
            ch.charts += self.plot_differece_group(absolute, difference_df, groupby, use_adjusted_intervals).charts
        else:
            # Same key normalisation as plot_summary / plot_multiple_difference.
            for _, level_df in difference_df.groupby(de_list_if_length_one(categorical_groups)):
                ch.charts += self.plot_differece_group(absolute, level_df, groupby, use_adjusted_intervals).charts
        return ch

    def plot_differece_group(self, absolute, difference_df, groupby, use_adjusted_intervals):
        """Return a ChartGrid for one difference frame, ordinal or categorical depending on ``groupby``."""
        if self._ordinal_group_column in listify(groupby):
            ch = self._ordinal_difference_plot(difference_df, absolute, groupby, use_adjusted_intervals)
            chart_grid = ChartGrid([ch])
        else:
            chart_grid = self._categorical_difference_plot(difference_df, absolute, groupby, use_adjusted_intervals)
        return chart_grid

    def plot_differences(
        self,
        difference_df,
        absolute,
        groupby,
        nims: NIM_TYPE,
        use_adjusted_intervals: bool,
        split_plot_by_groups: bool,
    ) -> ChartGrid:
        """Plot several level_1-vs-level_2 differences.

        Splitting follows the same rule as ``plot_difference``. ``nims`` is
        accepted for interface compatibility and is not read here.
        """
        ch = ChartGrid()
        categorical_groups = get_remaning_groups(listify(groupby), self._ordinal_group_column)
        if len(categorical_groups) == 0 or not split_plot_by_groups:
            ch.charts += self.plot_differences_group(absolute, difference_df, groupby, use_adjusted_intervals).charts
        else:
            # Same key normalisation as plot_summary / plot_multiple_difference.
            for _, level_df in difference_df.groupby(de_list_if_length_one(categorical_groups)):
                ch.charts += self.plot_differences_group(absolute, level_df, groupby, use_adjusted_intervals).charts
        return ch

    def plot_differences_group(self, absolute, difference_df, groupby, use_adjusted_intervals):
        """Return a ChartGrid for one frame of differences, grouping on the non-ordinal columns plus both level columns."""
        categorical_groups = get_remaning_groups(groupby, self._ordinal_group_column)
        groupby_columns = self._add_level_columns(categorical_groups)
        if self._ordinal_group_column in listify(groupby):
            ch = self._ordinal_difference_plot(difference_df, absolute, groupby_columns, use_adjusted_intervals)
            chart_grid = ChartGrid([ch])
        else:
            chart_grid = self._categorical_difference_plot(
                difference_df, absolute, groupby_columns, use_adjusted_intervals
            )
        return chart_grid

    def plot_multiple_difference(
        self,
        difference_df,
        absolute,
        groupby,
        level_as_reference,
        nims: NIM_TYPE,
        use_adjusted_intervals: bool,
        split_plot_by_groups: bool,
    ) -> ChartGrid:
        """Plot comparisons of several levels against one reference level.

        Splitting follows the same rule as ``plot_difference``. ``nims`` is
        accepted for interface compatibility and is not read here.
        """
        ch = ChartGrid()
        categorical_groups = get_remaning_groups(listify(groupby), self._ordinal_group_column)
        groupby = de_list_if_length_one(groupby)
        if len(categorical_groups) == 0 or not split_plot_by_groups:
            ch.charts += self.plot_multiple_difference_group(
                absolute, difference_df, groupby, level_as_reference, use_adjusted_intervals
            ).charts
        else:
            for _, level_df in difference_df.groupby(de_list_if_length_one(categorical_groups)):
                ch.charts += self.plot_multiple_difference_group(
                    absolute, level_df, groupby, level_as_reference, use_adjusted_intervals
                ).charts
        return ch

    def plot_multiple_difference_group(
        self, absolute, difference_df, groupby, level_as_reference, use_adjusted_intervals
    ):
        """Return a ChartGrid for one multiple-difference frame, ordinal or categorical depending on ``groupby``."""
        if self._ordinal_group_column in listify(groupby):
            ch = self._ordinal_multiple_difference_plot(
                difference_df, absolute, groupby, level_as_reference, use_adjusted_intervals
            )
            chart_grid = ChartGrid([ch])
        else:
            chart_grid = self._categorical_multiple_difference_plot(
                difference_df, absolute, groupby, level_as_reference, use_adjusted_intervals
            )
        return chart_grid

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _axis_numbers(df: DataFrame, lower: str, center: str, upper: str):
        """Stack the series that determine the y-axis range and tick precision.

        The null-hypothesis column is included only when ``df`` has one;
        ``concat`` silently drops the ``None`` placeholder otherwise. A fresh
        series is built on every call.
        """
        return concat(
            [
                df[lower],
                df[center],
                df[upper],
                df[NULL_HYPOTHESIS] if NULL_HYPOTHESIS in df.columns else None,
            ]
        )

    def _get_difference_plot_label(self, absolute):
        """Return the y-axis label for difference charts ("Absolute"/"Relative" change in numerator / denominator)."""
        change_type = "Absolute" if absolute else "Relative"
        return change_type + " change in {} / {}".format(self._numerator, self._denominator)

    # ------------------------------------------------------------------
    # Difference charts
    # ------------------------------------------------------------------
    def _ordinal_difference_plot(
        self, difference_df: DataFrame, absolute: bool, groupby: Union[str, Iterable], use_adjusted_intervals: bool
    ) -> Chart:
        """Draw the difference along the ordinal column, with a reference line at zero."""
        remaining_groups = get_remaning_groups(groupby, self._ordinal_group_column)
        if "level_1" in groupby and "level_2" in groupby:
            title = "Change from level_1 to level_2"
        else:
            title = "Change from {} to {}".format(
                difference_df["level_1"].values[0], difference_df["level_2"].values[0]
            )
        y_axis_label = self._get_difference_plot_label(absolute)
        ch = self._ordinal_plot(
            DIFFERENCE,
            difference_df,
            groupby=None,
            level_name="",
            remaining_groups=remaining_groups,
            absolute=absolute,
            title=title,
            y_axis_label=y_axis_label,
            use_adjusted_intervals=use_adjusted_intervals,
        )
        ch.callout.line(0)
        return ch

    def _categorical_difference_plot(
        self, difference_df: DataFrame, absolute: bool, groupby: Union[str, Iterable], use_adjusted_intervals: bool
    ) -> ChartGrid:
        """Draw a categorical interval chart of the difference.

        When ``groupby`` is None a constant placeholder column is used as the
        single category and the x axis is left unlabelled.
        """
        # The label must come from the caller's groupby, before the placeholder
        # substitution below; otherwise the axis would read "dummy_groupby".
        x_label = "" if groupby is None else "{}".format(groupby)
        if groupby is None:
            groupby = "dummy_groupby"
            # assign() returns a new frame, so the caller's frame is not modified.
            difference_df = difference_df.assign(**{groupby: "Difference"})

        if "level_1" in groupby and "level_2" in groupby:
            title = "Change from level_1 to level_2"
        else:
            title = "Change from {} to {}".format(
                difference_df["level_1"].values[0], difference_df["level_2"].values[0]
            )
        chart_grid = self._categorical_difference_chart(
            absolute, difference_df, groupby, title, x_label, use_adjusted_intervals
        )
        return chart_grid

    def _categorical_difference_chart(
        self,
        absolute: bool,
        difference_df: DataFrame,
        groupby_columns: Union[str, Iterable],
        title: str,
        x_label: str,
        use_adjusted_intervals: bool,
    ) -> ChartGrid:
        """Build the categorical difference chart shared by the single and multiple comparison plots.

        Draws the intervals, invisible hover targets named "center", and,
        when null hypotheses with a non-missing NIM are present, coloured
        dashes named "nim" at the null-hypothesis value.
        """
        LOWER, UPPER = (ADJUSTED_LOWER, ADJUSTED_UPPER) if use_adjusted_intervals else (CI_LOWER, CI_UPPER)
        axis_format, y_min, y_max = axis_format_precision(
            numbers=self._axis_numbers(difference_df, LOWER, DIFFERENCE, UPPER),
            absolute=absolute,
        )

        df = (
            difference_df.assign(**{LOWER: to_finite(difference_df[LOWER], y_min, y_max)})
            .assign(**{UPPER: to_finite(difference_df[UPPER], y_min, y_max)})
            .assign(level_1=difference_df.level_1.map(level2str))
            .assign(level_2=difference_df.level_2.map(level2str))
            .set_index(groupby_columns)
            .assign(categorical_x=lambda frame: frame.index.to_numpy())
            .reset_index()
        )

        ch = Chart(x_axis_type="categorical")
        ch.plot.interval(
            data_frame=df.sort_values(groupby_columns),
            categorical_columns=groupby_columns,
            lower_bound_column=LOWER,
            upper_bound_column=UPPER,
            middle_column=DIFFERENCE,
            categorical_order_by="labels",
            categorical_order_ascending=False,
        )

        # Also plot transparent circles, just to be able to show hover box
        ch.style.color_palette.reset_palette_order()
        ch.figure.circle(
            source=df, x="categorical_x", y=DIFFERENCE, size=20, name="center", line_alpha=0, fill_alpha=0
        )

        if NULL_HYPOTHESIS in df.columns and df[NIM].notna().any():

            def _nim_color(row):
                # Red when the interval crosses the null hypothesis on the side
                # that matters for the row's preference, green otherwise.
                crosses_for_increase = row[LOWER] < row[NULL_HYPOTHESIS] and row[PREFERENCE] == "increase"
                crosses_for_decrease = row[NULL_HYPOTHESIS] < row[UPPER] and row[PREFERENCE] == "decrease"
                return "red" if (crosses_for_increase or crosses_for_decrease) else "green"

            ch.style.color_palette.reset_palette_order()
            dash_source = (
                df[~df[NIM].isna()]
                .assign(color_column=lambda frame: frame.apply(_nim_color, axis=1))
                .sort_values(groupby_columns)
            )
            ch.figure.dash(
                source=dash_source,
                x="categorical_x",
                y=NULL_HYPOTHESIS,
                size=320 / len(df),
                line_width=3,
                name="nim",
                line_color="color_column",
            )

        ch.axes.set_yaxis_label(self._get_difference_plot_label(absolute))
        ch.set_source_label("")
        ch.callout.line(0)
        ch.axes.set_yaxis_range(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
        ch.axes.set_yaxis_tick_format(axis_format)
        ch.set_title(title)
        ch.axes.set_xaxis_label(x_label)
        ch.set_subtitle("")
        # The hover data is taken from the unclipped frame, not from `df`.
        self.add_tools(
            chart=ch,
            df=(
                difference_df.set_index(groupby_columns)
                .assign(categorical_x=lambda frame: frame.index.to_numpy())
                .reset_index()
            ),
            center_name=DIFFERENCE,
            absolute=absolute,
            ordinal=False,
            use_adjusted_intervals=use_adjusted_intervals,
        )
        chart_grid = ChartGrid()
        chart_grid.charts.append(ch)
        return chart_grid

    # ------------------------------------------------------------------
    # Summary charts
    # ------------------------------------------------------------------
    def _summary_plot(self, level_name: Union[str, Tuple], level_df: DataFrame, groupby: Union[str, Iterable]):
        """Draw one summary chart, ordinal when the ordinal column is not part of ``groupby``."""
        remaining_groups = get_remaning_groups(self._all_group_columns, groupby)
        if self._ordinal_group_column is not None and self._ordinal_group_column in remaining_groups:
            ch = self._ordinal_summary_plot(level_name, level_df, remaining_groups, groupby)
        else:
            ch = self._categorical_summary_plot(level_name, level_df, remaining_groups, groupby)
        return ch

    def _ordinal_summary_plot(
        self,
        level_name: Union[str, Tuple],
        level_df: DataFrame,
        remaining_groups: Union[str, Iterable],
        groupby: Union[str, Iterable],
    ):
        """Draw the point estimate along the ordinal column using unadjusted intervals."""
        remaining_groups = get_remaning_groups(remaining_groups, self._ordinal_group_column)
        title = "Estimate of {} / {}".format(self._numerator, self._denominator)
        y_axis_label = "{} / {}".format(self._numerator, self._denominator)
        return self._ordinal_plot(
            POINT_ESTIMATE,
            level_df,
            groupby,
            level_name,
            remaining_groups,
            absolute=True,
            title=title,
            y_axis_label=y_axis_label,
            use_adjusted_intervals=False,
        )

    def _ordinal_plot(
        self,
        center_name: str,
        level_df: DataFrame,
        groupby: Union[str, Iterable],
        level_name: Union[str, Tuple],
        remaining_groups: Union[str, Iterable],
        absolute: bool,
        title: str,
        y_axis_label: str,
        use_adjusted_intervals: bool,
    ):
        """Draw ``center_name`` against the ordinal column with a shaded interval band.

        Lines are coloured by ``remaining_groups`` when that is non-empty.
        When the frame has a null-hypothesis column it is drawn as a dashed
        line. Invisible renderers named "center" and "nim" are added as hover
        targets.
        """
        LOWER, UPPER = (ADJUSTED_LOWER, ADJUSTED_UPPER) if use_adjusted_intervals else (CI_LOWER, CI_UPPER)
        df = add_color_column(level_df, remaining_groups)
        colors = "color" if remaining_groups else None
        axis_format, y_min, y_max = axis_format_precision(
            numbers=self._axis_numbers(df, LOWER, center_name, UPPER),
            absolute=absolute,
        )

        ch = Chart(x_axis_type=self._ordinal_type())
        ch.plot.line(
            data_frame=df.sort_values(self._ordinal_group_column),
            x_column=self._ordinal_group_column,
            y_column=center_name,
            color_column=colors,
        )
        # Also plot a named transparent line, just to be able to show hover box
        ch.figure.line(source=df, x=self._ordinal_group_column, y=center_name, name="center", line_alpha=0)

        ch.style.color_palette.reset_palette_order()
        ch.plot.area(
            data_frame=(
                df.assign(**{LOWER: to_finite(df[LOWER], y_min, y_max)})
                .assign(**{UPPER: to_finite(df[UPPER], y_min, y_max)})
                .sort_values(self._ordinal_group_column)
            ),
            x_column=self._ordinal_group_column,
            y_column=LOWER,
            second_y_column=UPPER,
            color_column=colors,
        )

        if NULL_HYPOTHESIS in df.columns:
            ch.style.color_palette.reset_palette_order()
            ch.plot.line(
                data_frame=df.sort_values(self._ordinal_group_column),
                x_column=self._ordinal_group_column,
                y_column=NULL_HYPOTHESIS,
                color_column=colors,
                line_dash="dashed",
                line_width=1,
            )
            # Also plot named transparent line, just to be able to show hover box
            ch.figure.line(
                source=df.sort_values(self._ordinal_group_column),
                x=self._ordinal_group_column,
                y=NULL_HYPOTHESIS,
                line_width=3,
                line_alpha=0,
                name="nim",
            )

        ch.axes.set_yaxis_label(y_axis_label)
        ch.axes.set_xaxis_label(self._ordinal_group_column)
        ch.set_source_label("")
        ch.axes.set_yaxis_range(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))
        ch.axes.set_yaxis_tick_format(axis_format)
        subtitle = "" if not groupby else "{}: {}".format(groupby, level_name)
        ch.set_subtitle(subtitle)
        ch.set_title(title)
        if colors:
            ch.set_legend_location("outside_bottom")
        self.add_tools(
            chart=ch,
            df=df,
            center_name=center_name,
            absolute=absolute,
            ordinal=True,
            use_adjusted_intervals=use_adjusted_intervals,
        )
        return ch

    def _categorical_summary_plot(self, level_name, summary_df, remaining_groups, groupby):
        """Draw a categorical interval chart of the point estimate.

        Categories come from ``remaining_groups``; when that is empty the
        ``groupby`` columns are used instead.
        """
        if not remaining_groups:
            remaining_groups = listify(groupby)
        df = (
            summary_df.set_index(remaining_groups)
            .assign(categorical_x=lambda frame: frame.index.to_numpy())
            .reset_index()
        )
        axis_format, y_min, y_max = axis_format_precision(
            numbers=concat([df[CI_LOWER], df[POINT_ESTIMATE], df[CI_UPPER]]), absolute=True
        )

        ch = Chart(x_axis_type="categorical")
        ch.plot.interval(
            (
                df.assign(**{CI_LOWER: to_finite(df[CI_LOWER], y_min, y_max)}).assign(
                    **{CI_UPPER: to_finite(df[CI_UPPER], y_min, y_max)}
                )
            ),
            categorical_columns=remaining_groups,
            lower_bound_column=CI_LOWER,
            upper_bound_column=CI_UPPER,
            middle_column=POINT_ESTIMATE,
            categorical_order_by="labels",
            categorical_order_ascending=True,
        )

        # Also plot transparent circles, just to be able to show hover box
        ch.style.color_palette.reset_palette_order()
        ch.figure.circle(
            source=df, x="categorical_x", y=POINT_ESTIMATE, size=20, name="center", line_alpha=0, fill_alpha=0
        )

        ch.set_title("Estimate of {} / {}".format(self._numerator, self._denominator))
        if groupby:
            ch.set_subtitle("{}: {}".format(groupby, level_name))
        else:
            ch.set_subtitle("")
        ch.axes.set_xaxis_label("{}".format(", ".join(remaining_groups)))
        ch.axes.set_yaxis_label("{} / {}".format(self._numerator, self._denominator))
        ch.set_source_label("")
        ch.axes.set_yaxis_tick_format(axis_format)
        self.add_tools(
            chart=ch, df=df, center_name=POINT_ESTIMATE, absolute=True, ordinal=False, use_adjusted_intervals=False
        )
        return ch

    def _ordinal_type(self):
        """Return "datetime" when the ordinal column's dtype type subclasses ``np.datetime64``, else "linear"."""
        ordinal_column_type = self._df[self._ordinal_group_column].dtype.type
        axis_type = "datetime" if issubclass(ordinal_column_type, np.datetime64) else "linear"
        return axis_type

    # ------------------------------------------------------------------
    # Multiple-difference charts
    # ------------------------------------------------------------------
    def _ordinal_multiple_difference_plot(
        self,
        difference_df: DataFrame,
        absolute: bool,
        groupby: Union[str, Iterable],
        level_as_reference: bool,
        use_adjusted_intervals: bool,
    ):
        """Draw comparisons against a reference level along the ordinal column, with a reference line at zero."""
        remaining_groups = get_remaning_groups(groupby, self._ordinal_group_column)
        groupby_columns = self._add_level_column(remaining_groups, level_as_reference)
        title = self._get_multiple_difference_title(difference_df, level_as_reference)
        y_axis_label = self._get_difference_plot_label(absolute)
        ch = self._ordinal_plot(
            DIFFERENCE,
            difference_df,
            groupby=None,
            level_name="",
            remaining_groups=groupby_columns,
            absolute=absolute,
            title=title,
            y_axis_label=y_axis_label,
            use_adjusted_intervals=use_adjusted_intervals,
        )
        ch.callout.line(0)
        return ch

    def _categorical_multiple_difference_plot(
        self,
        difference_df: DataFrame,
        absolute: bool,
        groupby: Union[str, Iterable],
        level_as_reference: bool,
        use_adjusted_intervals: bool,
    ):
        """Draw comparisons against a reference level as a categorical interval chart."""
        groupby_columns = self._add_level_column(groupby, level_as_reference)
        title = self._get_multiple_difference_title(difference_df, level_as_reference)
        x_label = "" if groupby is None else "{}".format(groupby)
        chart_grid = self._categorical_difference_chart(
            absolute, difference_df, groupby_columns, title, x_label, use_adjusted_intervals
        )
        return chart_grid

    def _get_multiple_difference_title(self, difference_df, level_as_reference):
        """Return "Comparison to <level>" using the first row's reference-level value."""
        reference_level = "level_1" if level_as_reference else "level_2"
        title = "Comparison to {}".format(difference_df[reference_level].values[0])
        return title

    def _add_level_column(self, groupby, level_as_reference):
        """Return ``groupby`` extended with the non-reference level column.

        Returns the bare column name when ``groupby`` is None, otherwise a
        list. Any iterable of column names is accepted, not only lists.
        """
        level_column = "level_2" if level_as_reference else "level_1"
        if groupby is None:
            groupby_columns = level_column
        else:
            if isinstance(groupby, str):
                groupby_columns = [groupby, level_column]
            else:
                # list() so tuples and other iterables concatenate like lists do.
                groupby_columns = list(groupby) + [level_column]
        return groupby_columns

    def _add_level_columns(self, groupby):
        """Return ``groupby`` extended with both level columns, always as a list."""
        levels = ["level_1", "level_2"]
        if groupby is None:
            groupby_columns = levels
        else:
            if isinstance(groupby, str):
                groupby_columns = [groupby] + levels
            else:
                # list() so tuples and other iterables concatenate like lists do.
                groupby_columns = list(groupby) + levels
        return groupby_columns

    # ------------------------------------------------------------------
    # Hover tooling
    # ------------------------------------------------------------------
    def add_ci_to_chart_datasources(
        self, chart: Chart, df: DataFrame, center_name: str, ordinal: bool, use_adjusted_intervals: bool
    ):
        """Copy interval and test columns from ``df`` into the chart's data sources.

        Only sources that already hold ``center_name`` or the null-hypothesis
        column are touched. Rows are selected with each source's "index"
        entry. The hover fields written here are the ones referenced by the
        tooltips built in ``add_tools``.
        """
        LOWER, UPPER = (ADJUSTED_LOWER, ADJUSTED_UPPER) if use_adjusted_intervals else (CI_LOWER, CI_UPPER)
        group_col = "color" if ordinal and "color" in df.columns else "categorical_x"
        for data in chart.data:
            if center_name in data.keys() or NULL_HYPOTHESIS in data.keys():
                index = data["index"]
                data[LOWER] = np.array(df[LOWER][index])
                data[UPPER] = np.array(df[UPPER][index])
                data["color"] = np.array(df[group_col][index])

                # Difference-specific hover fields.
                if DIFFERENCE in data.keys() or NULL_HYPOTHESIS in data.keys():
                    data["reference_level"] = np.array(df["level_1"][index])
                    data[DIFFERENCE] = np.array(df[DIFFERENCE][index])
                    data["p_value"] = np.array(df[P_VALUE][index])
                    data["adjusted_p"] = np.array(df[ADJUSTED_P][index])
                    data["reference_level_avg"] = np.array(df[ORIGINAL_POINT_ESTIMATE + SFX1][index])
                    if NULL_HYPOTHESIS in df.columns:
                        data["null_hyp"] = np.array(df[NULL_HYPOTHESIS][index])

    def add_tools(
        self,
        chart: Chart,
        df: DataFrame,
        center_name: str,
        absolute: bool,
        ordinal: bool,
        use_adjusted_intervals: bool,
    ):
        """Attach hover, zoom, pan and reset tools to ``chart``.

        The hover tool is bound to the renderers named "center" and "nim".
        Box zoom is made the active drag tool, and legend entries hide their
        glyphs on click when the figure has a legend.
        """
        self.add_ci_to_chart_datasources(chart, df, center_name, ordinal, use_adjusted_intervals)
        LOWER, UPPER = (ADJUSTED_LOWER, ADJUSTED_UPPER) if use_adjusted_intervals else (CI_LOWER, CI_UPPER)

        if len(chart.figure.legend) > 0:
            chart.figure.legend.click_policy = "hide"

        axis_format, y_min, y_max = axis_format_precision(
            numbers=self._axis_numbers(df, LOWER, center_name, UPPER),
            absolute=absolute,
            extra_zeros=2,
        )
        # The reference-level average is always formatted as an absolute number.
        axis_format_reference_level, _, _ = axis_format_precision(
            numbers=self._axis_numbers(df, LOWER, center_name, UPPER),
            absolute=True,
            extra_zeros=2,
        )

        # Braces around the field name let bokeh resolve column names that
        # contain spaces; plain names resolve the same way as before.
        ordinal_tool_tip = (
            [] if not ordinal else [(self._ordinal_group_column, f"@{{{self._ordinal_group_column}}}")]
        )
        p_value_tool_tip = (
            (
                [("p-value", "@p_value{0.0000}")]
                + ([("adjusted p-value", "@adjusted_p{0.0000}")] if len(df) > 1 else [])
            )
            if center_name == DIFFERENCE
            else []
        )
        nim_tool_tip = [("null hypothesis", f"@null_hyp{{{axis_format}}}")] if NULL_HYPOTHESIS in df.columns else []
        reference_level_tool_tip = (
            [("reference level", f"@reference_level: @reference_level_avg{{{axis_format_reference_level}}}")]
            if "level_1" in df.columns
            else []
        )
        tooltips = (
            [("group", "@color")]
            + reference_level_tool_tip
            + ordinal_tool_tip
            + [(f"{center_name}", f"@{center_name}{{{axis_format}}}")]
            + [
                (
                    ("adjusted " if use_adjusted_intervals else "") + "confidence interval",
                    f"(@{{{LOWER}}}{{{axis_format}}}," f" @{{{UPPER}}}{{{axis_format}}})",
                )
            ]
            + p_value_tool_tip
            + nim_tool_tip
        )

        lines_with_hover = ["center", "nim"]
        renderers = [r for r in chart.figure.renderers if r.name in lines_with_hover]
        hover = tools.HoverTool(tooltips=tooltips, renderers=renderers)
        box_zoom = tools.BoxZoomTool()
        chart.figure.add_tools(
            hover, tools.ZoomInTool(), tools.ZoomOutTool(), box_zoom, tools.PanTool(), tools.ResetTool()
        )
        chart.figure.toolbar.active_drag = box_zoom