in sourcecode/scoring/note_ratings.py [0:0]
def compute_note_stats(ratings: pd.DataFrame, noteStatusHistory: pd.DataFrame) -> pd.DataFrame:
    """Compute per-note rating aggregates and merge in noteStatusHistory fields.

    This function computes note aggregates over ratings and then merges additional fields from
    noteStatusHistory. In general, we do not expect that every note in noteStatusHistory will
    also appear in ratings (e.g. some notes have no ratings), so the aggregate values for those
    notes are filled with zero. We do expect that all notes observed in ratings will appear in
    noteStatusHistory, and that expectation is verified explicitly before returning.

    Note that the content of both ratings and noteStatusHistory may vary across callsites. For
    example:
      * Scoring models operating on subsets of notes and ratings may pre-filter both
        ratings and noteStatusHistory to only include notes/ratings that are in-scope.
      * During meta scoring we may invoke compute_note_stats with the full set of ratings
        and notes to compute note stats supporting contributor helpfulness aggregates.

    Args:
      ratings (pd.DataFrame): all ratings
      noteStatusHistory (pd.DataFrame): history of note statuses

    Returns:
      pd.DataFrame containing stats about each note: the rating count, the count of ratings
      inside the recent window, per-tag sums, and the selected noteStatusHistory fields.

    Raises:
      AssertionError: if the merged result does not have the same number of rows as
        noteStatusHistory, i.e. ratings referenced a note missing from noteStatusHistory
        (assuming noteStatusHistory holds one row per note).
    """
    # Millisecond timestamp marking the start of the "recent ratings" window. The window ends
    # at c.epochMillis and spans c.emergingWriterDays days (the output column name suggests this
    # is expected to be 28 -- TODO confirm).
    windowStart = datetime.fromtimestamp(c.epochMillis / 1000, tz=timezone.utc) - timedelta(
        days=c.emergingWriterDays
    )
    recentCutoffMillis = 1000 * windowStart.timestamp()

    # Build the column lists once so that aggregation, the merge allow-list, the NaN fill and
    # the integer cast all operate on exactly the same set of columns.
    tagColumns = c.helpfulTagsTSVOrder + c.notHelpfulTagsTSVOrder
    aggregateColumns = [c.numRatingsKey, c.numRatingsLast28DaysKey] + tagColumns

    # Work on a separate frame holding only the note id and tag columns, then add one counter
    # column for "any rating" and one flag column for "rating inside the recent window".
    ratingsToUse = pd.DataFrame(ratings[[c.noteIdKey] + tagColumns])
    ratingsToUse.loc[:, c.numRatingsKey] = 1
    ratingsToUse.loc[:, c.numRatingsLast28DaysKey] = False
    ratingsToUse.loc[
        ratings[c.createdAtMillisKey] > recentCutoffMillis, c.numRatingsLast28DaysKey
    ] = True

    # Summing per note turns the counter, the recency flag and every tag column into totals.
    noteStats = ratingsToUse.groupby(c.noteIdKey).sum()

    # Outer merge keeps notes that have no ratings at all.
    # NOTE(review): unsafeAllowed is not a stock pandas merge argument; presumably it is
    # provided by a project-level pandas wrapper and lists columns allowed to change dtype
    # when the merge introduces NaN -- confirm against that wrapper.
    noteStats = noteStats.merge(
        noteStatusHistory[
            [
                c.noteIdKey,
                c.createdAtMillisKey,
                c.noteAuthorParticipantIdKey,
                c.classificationKey,
                c.currentLabelKey,
                c.lockedStatusKey,
            ]
        ],
        on=c.noteIdKey,
        how="outer",
        unsafeAllowed=set(aggregateColumns),
    )

    # Fill in nan values resulting from the outer merge with zero since these values were not
    # present during aggregation, then restore an integer dtype for the aggregate columns.
    noteStats = noteStats.fillna({col: 0 for col in aggregateColumns})
    noteStats[aggregateColumns] = noteStats[aggregateColumns].astype(np.int64)

    # Validate that notes in ratings were a subset of noteStatusHistory. This is raised
    # explicitly (rather than via an assert statement) so the check still runs when Python is
    # started with -O; the exception type and message are unchanged for callers.
    if len(noteStats) != len(noteStatusHistory):
        raise AssertionError("noteStatusHistory should contain all notes")
    return noteStats