# sourcecode/scoring/post_selection_similarity_old.py
import numpy as np
import pandas as pd

# Assumed module-level import: `c` is the scoring constants module
# (e.g. `from . import constants as c`), which defines the column-name keys used below.
from . import constants as c


def _get_pair_ratings_df_optimized(ratings, windowMillis):
  # Assign column keys to local variables for faster access.
  noteIdKey = c.noteIdKey
  createdAtMillisKey = c.createdAtMillisKey
  raterParticipantIdKey = c.raterParticipantIdKey
  tweetIdKey = c.tweetIdKey

  # Sort ratings by note and rating timestamp so each note's ratings are in time order.
  ratings_sorted = ratings.sort_values([noteIdKey, createdAtMillisKey])

  # Lists that collect one entry per co-rating event (two raters rating the same note
  # within windowMillis of each other).
  left_raters = []
  right_raters = []
  tweet_ids = []

  # Group by note to process each note's ratings individually.
  grouped = ratings_sorted.groupby(noteIdKey, sort=False)
  for noteId, group in grouped:
    # Extract the relevant columns as numpy arrays for efficient computation.
    times = group[createdAtMillisKey].values
    raters = group[raterParticipantIdKey].values
    tweetId = group[tweetIdKey].iloc[0]  # tweetIdKey is assumed constant within a note.
    n = len(group)

    # Slide a time window over the sorted ratings of this note.
    window_start = 0
    for i in range(n):
      # Advance the window start while the time gap to the current rating exceeds windowMillis.
      while times[i] - times[window_start] > windowMillis:
        window_start += 1
      # Pair the current rating with every earlier rating inside the window
      # (excluding the current index itself).
      for j in range(window_start, i):
        if raters[i] != raters[j]:
          left_rater, right_rater = tuple(sorted((raters[i], raters[j])))
          left_raters.append(left_rater)
          right_raters.append(right_rater)
          tweet_ids.append(tweetId)

  # Convert lists to numpy arrays for efficient DataFrame creation.
  left_raters = np.array(left_raters)
  right_raters = np.array(right_raters)
  tweet_ids = np.array(tweet_ids)

  # Build the pair DataFrame from the collected co-rating events.
  df = pd.DataFrame(
    {
      'leftRaterId': left_raters,
      'rightRaterId': right_raters,
      'tweetId': tweet_ids,
    }
  )

  # Keep each (pair, tweet) combination at most once.
  df = df.drop_duplicates()

  # For each rater pair, count the number of distinct tweets they co-rated within the window.
  df = (
    df.groupby(['leftRaterId', 'rightRaterId'], as_index=False)
    .agg(pairRatings=('tweetId', 'count'))
  )
  return df
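

# Minimal usage sketch (hypothetical helper, not part of the original module): builds a tiny
# ratings DataFrame using the same constant keys and runs the pairing function on it. With a
# 60-second window, the two ratings on note 1 fall inside the window, so raterA and raterB form
# one pair for tweetId 10; the lone rating on note 2 contributes nothing.
def _example_pair_ratings():
  ratings = pd.DataFrame(
    {
      c.noteIdKey: [1, 1, 2],
      c.raterParticipantIdKey: ['raterA', 'raterB', 'raterA'],
      c.createdAtMillisKey: [1_000, 2_000, 500_000],
      c.tweetIdKey: [10, 10, 20],
    }
  )
  # Expected result: a single row with leftRaterId='raterA', rightRaterId='raterB', pairRatings=1.
  return _get_pair_ratings_df_optimized(ratings, windowMillis=60_000)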