def _get_pair_counts_df_dict()

in sourcecode/scoring/post_selection_similarity_old.py [0:0]


def _get_pair_counts_df_dict(ratings, windowMillis):
    """Count how often two different raters rated the same note close in time.

    Ratings are ordered by note and then by creation time.  Within each note,
    every rating is paired with each earlier rating on that note whose creation
    time is at most ``windowMillis`` before it.  Pairs made of the same rater
    are skipped.  Each remaining co-occurrence adds one to the tally of the
    (smaller id, larger id) pair, so a pair is stored under one key regardless
    of which rater rated first.

    Args:
        ratings: pandas DataFrame with the columns named by ``c.noteIdKey``,
            ``c.createdAtMillisKey`` and ``c.raterParticipantIdKey``
            (``c`` is resolved from the enclosing module).
        windowMillis: largest allowed difference between two creation times
            for the ratings to be paired; presumably milliseconds, going by
            the column name -- confirm with the caller.

    Returns:
        DataFrame with columns ``leftRaterId``, ``rightRaterId`` and
        ``pairRatings``, one row per rater pair seen at least once.  The frame
        has those columns and no rows when no pair was found.
    """
    import numpy as np
    import pandas as pd
    from collections import defaultdict

    # Resolve the column names once instead of on every access.
    note_col = c.noteIdKey
    time_col = c.createdAtMillisKey
    rater_col = c.raterParticipantIdKey

    # Ordering by (note, time) keeps each note's ratings chronological, which
    # is what lets the window start below only ever move forward.
    ordered = ratings.sort_values([note_col, time_col])

    tally = defaultdict(int)

    for _, note_ratings in ordered.groupby(note_col, sort=False):
        stamps = note_ratings[time_col].values
        rater_ids = note_ratings[rater_col].values

        # Index of the oldest rating still inside the window.
        oldest = 0
        for position, (stamp, rater) in enumerate(zip(stamps, rater_ids)):
            # Drop ratings that are now too old relative to the current one.
            while stamp - stamps[oldest] > windowMillis:
                oldest += 1

            # Pair the current rating with every earlier one still in range.
            for earlier_rater in rater_ids[oldest:position]:
                if rater != earlier_rater:
                    tally[tuple(sorted((rater, earlier_rater)))] += 1

    if not tally:
        # Nothing paired up: hand back the expected columns with no rows.
        return pd.DataFrame(columns=['leftRaterId', 'rightRaterId', 'pairRatings'])

    # Two-column array of pair ids alongside a parallel array of tallies.
    pair_matrix = np.array(list(tally))
    pair_totals = np.array(list(tally.values()))
    return pd.DataFrame({
        'leftRaterId': pair_matrix[:, 0],
        'rightRaterId': pair_matrix[:, 1],
        'pairRatings': pair_totals
    })