in sourcecode/scoring/post_selection_similarity_old.py [0:0]
def _get_pair_counts_df_dict(ratings, windowMillis):
    """Count how often each pair of raters rated the same note close in time.

    Ratings are ordered by note and creation time. Within each note, every
    rating is paired with each earlier rating on that note whose timestamp is
    at most ``windowMillis`` older. Pairs are unordered: the two rater ids are
    sorted before counting, so (a, b) and (b, a) share one counter. Two
    ratings carrying the same rater id never form a pair.

    Args:
      ratings: DataFrame with at least the columns named by ``c.noteIdKey``,
        ``c.createdAtMillisKey`` and ``c.raterParticipantIdKey``. Rater ids
        must be mutually orderable because each pair is sorted.
      windowMillis: largest allowed difference (inclusive) between the
        ``c.createdAtMillisKey`` values of two ratings for them to count as a
        pair. A negative value yields no pairs.

    Returns:
      DataFrame with columns ``leftRaterId``, ``rightRaterId`` and
      ``pairRatings`` holding one row per rater pair seen at least once, or an
      empty DataFrame with those columns when no pair was found.
    """
    import numpy as np
    import pandas as pd
    from collections import defaultdict

    # Bind the column names once; they are used repeatedly below.
    noteIdKey = c.noteIdKey
    createdAtMillisKey = c.createdAtMillisKey
    raterParticipantIdKey = c.raterParticipantIdKey

    # Sorting by note and then time makes each group's timestamps
    # non-decreasing, which the sliding window below relies on.
    ratings_sorted = ratings.sort_values([noteIdKey, createdAtMillisKey])

    pair_counts = defaultdict(int)

    # sort=False keeps the groups in the already-sorted row order.
    for _, group in ratings_sorted.groupby(noteIdKey, sort=False):
        # Plain numpy arrays avoid pandas indexing overhead in the inner loops.
        times = group[createdAtMillisKey].values
        raters = group[raterParticipantIdKey].values
        window_start = 0
        for i in range(len(group)):
            # Drop ratings that are now more than windowMillis older than
            # rating i. The `window_start < i` bound keeps the index inside
            # the array even when windowMillis is negative.
            while window_start < i and times[i] - times[window_start] > windowMillis:
                window_start += 1
            current_rater = raters[i]
            # Pair rating i with every earlier rating still inside the window.
            for j in range(window_start, i):
                other_rater = raters[j]
                if current_rater != other_rater:
                    # Canonical ordering so each unordered pair has one key.
                    left_rater, right_rater = sorted((current_rater, other_rater))
                    pair_counts[(left_rater, right_rater)] += 1

    if not pair_counts:
        # No co-rating pairs: return an empty frame with the expected columns.
        return pd.DataFrame(columns=['leftRaterId', 'rightRaterId', 'pairRatings'])

    # Shape (num_pairs, 2): column 0 is the smaller id, column 1 the larger.
    pairs = np.array(list(pair_counts.keys()))
    counts = np.array(list(pair_counts.values()))
    return pd.DataFrame({
        'leftRaterId': pairs[:, 0],
        'rightRaterId': pairs[:, 1],
        'pairRatings': counts,
    })