in sourcecode/scoring/incorrect_filter.py [0:0]
def get_user_incorrect_ratio(ratings: pd.DataFrame) -> pd.DataFrame:
    """Tally the per-rater counts behind p(incorrect | not helpful tags assigned).

    Only ratings on which at least one not-helpful tag was set are considered.
    The function returns the two counts (numerator and denominator); the
    division into an actual ratio is not performed here.

    Called during prescoring only, since it uses entire rating history.

    Args:
      ratings: DF containing ratings.

    Returns:
      pd.DataFrame with one row per rater who assigned any not-helpful tag,
      holding c.raterParticipantIdKey, c.incorrectTagRatingsMadeByRaterKey
      (sum of the "incorrect" tag column over those ratings) and
      c.totalRatingsMadeByRaterKey (count of non-null note ids over those
      ratings).
    """
    rater_key = c.raterParticipantIdKey

    # Keep only the ratings where the rater used at least one not-helpful tag.
    has_nh_tag = ratings[c.notHelpfulTagsTSVOrder].sum(axis=1) > 0
    nh_ratings = ratings.loc[has_nh_tag]

    # Numerator: how many of those ratings carried the "incorrect" tag.
    incorrect_by_rater = (
        nh_ratings[[rater_key, c.notHelpfulIncorrectTagKey]].groupby(rater_key).sum()
    )
    incorrect_by_rater = incorrect_by_rater.rename(
        columns={c.notHelpfulIncorrectTagKey: c.incorrectTagRatingsMadeByRaterKey}
    ).reset_index()

    # Denominator: how many not-helpful-tagged ratings the rater made in total.
    nh_count_by_rater = nh_ratings[[rater_key, c.noteIdKey]].groupby(rater_key).count()
    nh_count_by_rater = nh_count_by_rater.rename(
        columns={c.noteIdKey: c.totalRatingsMadeByRaterKey}
    ).reset_index()

    # Both frames come from the same group-by key, so the default inner merge
    # lines the two counts up one row per rater.
    return incorrect_by_rater.merge(nh_count_by_rater, on=rater_key)