def _get_pair_ratings_df_optimized()

in sourcecode/scoring/post_selection_similarity_old.py [0:0]


def _get_pair_ratings_df_optimized(ratings, windowMillis):
    """Count, per pair of raters, the distinct tweets they rated close together in time.

    Ratings are processed one note at a time in creation-time order. Two
    different raters form a pair on a note when the creation times of their
    ratings on that note differ by at most ``windowMillis``. Each pair is
    stored in ascending rater-id order, so (a, b) and (b, a) collapse into a
    single pair. The tweet id recorded for a pair is read from the first row
    of the note's group (this assumes the tweet id is constant within a note).

    Args:
        ratings: DataFrame holding the columns named by ``c.noteIdKey``,
            ``c.createdAtMillisKey``, ``c.raterParticipantIdKey`` and
            ``c.tweetIdKey``.
        windowMillis: Largest allowed difference between two creation times,
            expressed in the same unit as the ``c.createdAtMillisKey`` column.
            A negative value produces no pairs.

    Returns:
        DataFrame with columns ``leftRaterId``, ``rightRaterId`` and
        ``pairRatings``, where ``pairRatings`` is the pandas ``count`` of the
        de-duplicated ``tweetId`` values recorded for that pair.
    """
    # Assign column keys to local variables for faster access
    noteIdKey = c.noteIdKey
    createdAtMillisKey = c.createdAtMillisKey
    raterParticipantIdKey = c.raterParticipantIdKey
    tweetIdKey = c.tweetIdKey

    # Sorting by note and then time lets each note be scanned with a single
    # forward-moving window.
    ratings_sorted = ratings.sort_values([noteIdKey, createdAtMillisKey])

    # One entry per (pair, note) hit; duplicates are removed further down.
    left_raters = []
    right_raters = []
    tweet_ids = []

    for _, group in ratings_sorted.groupby(noteIdKey, sort=False):
        n = len(group)
        if n < 2:
            # A note with a single rating cannot contribute a pair.
            continue

        # Extract relevant columns as numpy arrays for efficient computation
        times = group[createdAtMillisKey].values
        raters = group[raterParticipantIdKey].values
        tweetId = group[tweetIdKey].iloc[0]  # Assuming tweetIdKey is constant within a note

        window_start = 0
        # Index 0 has no earlier rating to pair with, so start at 1.
        for i in range(1, n):
            current_time = times[i]
            # Drop ratings that are too old. The `window_start < i` bound keeps
            # the pointer inside the array even when windowMillis is negative.
            while window_start < i and current_time - times[window_start] > windowMillis:
                window_start += 1

            current_rater = raters[i]
            # Pair the current rating with every earlier rating still in the window.
            for j in range(window_start, i):
                other_rater = raters[j]
                if current_rater != other_rater:
                    left_rater, right_rater = sorted((current_rater, other_rater))
                    left_raters.append(left_rater)
                    right_raters.append(right_rater)
                    tweet_ids.append(tweetId)

    # Convert lists to numpy arrays for efficient DataFrame creation
    df = pd.DataFrame({
        'leftRaterId': np.array(left_raters),
        'rightRaterId': np.array(right_raters),
        'tweetId': np.array(tweet_ids),
    })

    # A pair can hit the same tweet through several notes; keep one row per
    # (pair, tweet) so the count below is over distinct tweets.
    df = df.drop_duplicates()

    # Group by leftRaterId and rightRaterId and count the number of occurrences
    df = (
        df.groupby(['leftRaterId', 'rightRaterId'], as_index=False)
        .agg(pairRatings=('tweetId', 'count'))
    )
    return df