in sourcecode/scoring/post_selection_similarity.py [0:0]
def _get_pair_counts_dict(ratings, windowMillis):
    """Count, per pair of raters, the tweets on which they rated a note close in time.

    Ratings are grouped by ``c.tweetIdKey`` and then by ``c.noteIdKey``. Within
    each note the ratings are ordered by ``c.createdAtMillisKey`` and scanned
    with a sliding window: two ratings are paired when their timestamps differ
    by at most ``windowMillis`` and their ``c.raterParticipantIdKey`` values
    differ. A given pair is counted at most once per tweet, no matter how many
    notes on that tweet (or how many ratings on one note) produce it.

    Args:
        ratings: DataFrame providing the ``c.tweetIdKey``, ``c.noteIdKey``,
            ``c.createdAtMillisKey`` and ``c.raterParticipantIdKey`` columns.
            It is not modified.
        windowMillis: Largest timestamp difference (inclusive) for which two
            ratings on the same note are treated as co-occurring. A negative
            value yields no pairs.

    Returns:
        A dict mapping ``(rater_a, rater_b)`` tuples, with the two rater ids
        in sorted order, to the number of tweets on which that pair was
        observed.
    """
    pair_counts = {}

    # Each tweet is handled independently so a pair is counted once per tweet.
    for _, tweet_group in ratings.groupby(c.tweetIdKey, sort=False):
        # Pairs already credited for the current tweet.
        pairs_counted_in_tweet = set()

        for _, note_group in tweet_group.groupby(c.noteIdKey, sort=False):
            # Sort into a new frame rather than in place: the group is derived
            # from the caller's data, and mutating it in place can raise
            # SettingWithCopyWarning without changing the result.
            ordered = note_group.sort_values(c.createdAtMillisKey)

            # Plain arrays keep the per-element indexing below cheap.
            times = ordered[c.createdAtMillisKey].values
            raters = ordered[c.raterParticipantIdKey].values

            window_start = 0
            for i in range(len(times)):
                # Drop ratings that are now too old relative to rating i. The
                # `window_start < i` bound is a no-op for non-negative windows
                # and keeps a negative window from indexing past the array.
                while window_start < i and times[i] - times[window_start] > windowMillis:
                    window_start += 1

                current_rater = raters[i]
                # Every earlier rating still inside the window pairs with rating i.
                for j in range(window_start, i):
                    other_rater = raters[j]
                    if current_rater != other_rater:
                        # Sorted order makes (a, b) and (b, a) the same key.
                        pair = tuple(sorted((current_rater, other_rater)))
                        if pair in pairs_counted_in_tweet:
                            continue
                        pairs_counted_in_tweet.add(pair)
                        pair_counts[pair] = pair_counts.get(pair, 0) + 1

    return pair_counts