spotify_confidence/analysis/frequentist/confidence_computers/chi_squared_computer.py (68 lines of code) (raw):
from typing import Tuple, Dict
import numpy as np
from pandas import DataFrame, Series
from statsmodels.stats.proportion import proportion_confint, proportions_chisquare, confint_proportions_2indep
from spotify_confidence.analysis.confidence_utils import power_calculation
from spotify_confidence.analysis.constants import (
NUMERATOR,
DENOMINATOR,
INTERVAL_SIZE,
POINT_ESTIMATE,
VARIANCE,
CI_LOWER,
CI_UPPER,
SFX1,
SFX2,
)
def point_estimate(df: DataFrame, **kwargs: Dict[str, str]) -> Series:
    """Compute the observed proportion (numerator / denominator) for each row.

    Args:
        df: Data holding the numerator and denominator columns.
        **kwargs: Must contain entries keyed by the ``NUMERATOR`` and
            ``DENOMINATOR`` constants; their values are the names of the
            corresponding columns in ``df``.

    Returns:
        The element-wise ratio of the numerator column to the denominator
        column.

    Raises:
        ValueError: If any value in the denominator column equals 0.
    """
    numerator = kwargs[NUMERATOR]
    denominator = kwargs[DENOMINATOR]

    # Fail loudly instead of silently producing inf/NaN proportions.
    if (df[denominator] == 0).any():
        raise ValueError("""Can't compute point estimate: denominator is 0""")

    return df[numerator] / df[denominator]
def variance(df: DataFrame, **kwargs: Dict[str, str]) -> Series:
    """Compute ``p * (1 - p)`` from the point-estimate column of ``df``.

    Args:
        df: Data containing the ``POINT_ESTIMATE`` column.
        **kwargs: Unused; accepted for a uniform computer interface.

    Returns:
        The element-wise product of the point estimate and its complement.

    Raises:
        ValueError: If any computed value is negative, i.e. a point estimate
            lies outside the interval [0, 1].
    """
    var = df[POINT_ESTIMATE] * (1 - df[POINT_ESTIMATE])
    has_negative = (var < 0).any()
    if has_negative:
        raise ValueError(f"Computed variance is negative: {var}. Please check your inputs.")
    return var
def std_err(df: DataFrame, **kwargs: Dict[str, str]) -> Series:
    """Compute the unpooled standard error of the difference of two proportions.

    Args:
        df: Data holding, for both groups (suffixes ``SFX1`` and ``SFX2``),
            a variance column and a denominator column.
        **kwargs: Must contain an entry keyed by the ``DENOMINATOR`` constant
            whose value is the base name of the denominator column.

    Returns:
        ``sqrt(var_1 / n_1 + var_2 / n_2)`` evaluated element-wise.
    """
    denominator = kwargs[DENOMINATOR]

    # Per-group contribution: variance divided by the group's denominator.
    group1_term = df[VARIANCE + SFX1] / df[denominator + SFX1]
    group2_term = df[VARIANCE + SFX2] / df[denominator + SFX2]

    return np.sqrt(group1_term + group2_term)
def add_point_estimate_ci(df: DataFrame, **kwargs: Dict[str, str]) -> DataFrame:
    """Add confidence-interval bounds for the point estimate to ``df``.

    The bounds come from statsmodels' ``proportion_confint`` called with its
    default method and are written to the ``CI_LOWER`` and ``CI_UPPER``
    columns.

    Note:
        ``df`` is modified in place; the same object is returned.

    Args:
        df: Data holding the numerator and denominator columns.
        **kwargs: Must contain entries keyed by the ``NUMERATOR``,
            ``DENOMINATOR`` and ``INTERVAL_SIZE`` constants. The first two
            name columns in ``df``; the last is the coverage of the interval
            (``alpha`` is computed as ``1 - interval_size``).

    Returns:
        ``df`` with the ``CI_LOWER`` and ``CI_UPPER`` columns set.
    """
    numerator = kwargs[NUMERATOR]
    denominator = kwargs[DENOMINATOR]
    interval_size = kwargs[INTERVAL_SIZE]

    ci_lower, ci_upper = proportion_confint(
        count=df[numerator],
        nobs=df[denominator],
        alpha=1 - interval_size,
    )
    df[CI_LOWER] = ci_lower
    df[CI_UPPER] = ci_upper
    return df
def p_value(df: DataFrame, **kwargs: Dict[str, str]) -> Series:
    """Compute a chi-squared test p-value for every row of ``df``.

    Each row describes two groups (suffixes ``SFX1`` and ``SFX2``); their
    counts and numbers of observations are passed to statsmodels'
    ``proportions_chisquare``.

    Args:
        df: Data holding suffixed numerator and denominator columns.
        **kwargs: Must contain entries keyed by the ``NUMERATOR`` and
            ``DENOMINATOR`` constants whose values are the base column names.

    Returns:
        The result of applying the row-wise test along ``axis=1``.
    """
    count_col_1, count_col_2 = kwargs[NUMERATOR] + SFX1, kwargs[NUMERATOR] + SFX2
    nobs_col_1, nobs_col_2 = kwargs[DENOMINATOR] + SFX1, kwargs[DENOMINATOR] + SFX2

    def _row_chi2_p_value(row):
        test_result = proportions_chisquare(
            count=[row[count_col_1], row[count_col_2]],
            nobs=[row[nobs_col_1], row[nobs_col_2]],
        )
        # proportions_chisquare returns (statistic, p-value, extra); keep the p-value.
        return test_result[1]

    return df.apply(_row_chi2_p_value, axis=1)
def ci(df: DataFrame, alpha_column: str, **kwargs: Dict[str, str]) -> Tuple[Series, Series]:
    """Compute a Wald confidence interval for the difference in proportions.

    The ``SFX2`` group is passed as sample 1 and the ``SFX1`` group as
    sample 2 to statsmodels' ``confint_proportions_2indep``, so with
    ``compare="diff"`` the interval refers to (group 2 - group 1).

    Args:
        df: Data holding suffixed numerator and denominator columns and the
            alpha column.
        alpha_column: Name of the column in ``df`` holding the significance
            level to use.
        **kwargs: Must contain entries keyed by the ``NUMERATOR`` and
            ``DENOMINATOR`` constants whose values are the base column names.

    Returns:
        The (lower, upper) bounds returned by ``confint_proportions_2indep``.
    """
    numerator = kwargs[NUMERATOR]
    denominator = kwargs[DENOMINATOR]

    # Group 2 goes first on purpose: the difference is taken as sample1 - sample2.
    group2_count = df[numerator + SFX2]
    group2_nobs = df[denominator + SFX2]
    group1_count = df[numerator + SFX1]
    group1_nobs = df[denominator + SFX1]
    alpha = df[alpha_column]

    return confint_proportions_2indep(
        count1=group2_count,
        nobs1=group2_nobs,
        count2=group1_count,
        nobs2=group1_nobs,
        alpha=alpha,
        compare="diff",
        method="wald",
    )
def achieved_power(df: DataFrame, mde: float, alpha: float, **kwargs: Dict[str, str]) -> DataFrame:
    """Compute the achieved power using the pooled-proportion variance.

    Args:
        df: Data holding suffixed numerator and denominator columns for the
            two groups (suffixes ``SFX1`` and ``SFX2``).
        mde: Passed through to ``power_calculation`` as its first argument
            (presumably the minimum detectable effect; confirm against
            ``power_calculation``).
        alpha: Passed through to ``power_calculation`` as its third argument.
        **kwargs: Must contain entries keyed by the ``NUMERATOR`` and
            ``DENOMINATOR`` constants whose values are the base column names.

    Returns:
        Whatever ``power_calculation`` returns for the given inputs.
    """
    numerator = kwargs[NUMERATOR]
    denominator = kwargs[DENOMINATOR]

    # Pool both groups: total numerator over total denominator.
    total_successes = df[numerator + SFX1] + df[numerator + SFX2]
    total_observations = df[denominator + SFX1] + df[denominator + SFX2]
    pooled_proportion = total_successes / total_observations

    pooled_variance = pooled_proportion * (1 - pooled_proportion)
    return power_calculation(
        mde,
        pooled_variance,
        alpha,
        df[denominator + SFX1],
        df[denominator + SFX2],
    )