def _get_pair_counts_dict()

in sourcecode/scoring/post_selection_similarity.py [0:0]


def _get_pair_counts_dict(ratings, windowMillis):
  """Count, for each pair of raters, the tweets on which they rated a note close in time.

  Ratings are grouped by tweet and then by note.  Within each note the ratings
  are ordered by creation time and scanned with a sliding window: two ratings
  made by different raters form a pair when their creation times differ by at
  most windowMillis.  A pair is counted at most once per tweet, regardless of
  how many notes on that tweet the two raters both rated.

  Args:
    ratings: DataFrame containing the columns named by c.tweetIdKey,
      c.noteIdKey, c.createdAtMillisKey and c.raterParticipantIdKey.  It is
      not modified.
    windowMillis: largest allowed difference between the creation times of two
      ratings on the same note for them to be paired (same unit as the
      c.createdAtMillisKey column).  A negative value yields no pairs.

  Returns:
    dict mapping (rater, rater) tuples -- the two rater IDs in sorted order --
    to the number of tweets on which that pair was observed.
  """
  pair_counts = {}

  # Process each tweet on its own so that a pair is counted once per tweet.
  for _, tweet_group in ratings.groupby(c.tweetIdKey, sort=False):
    pairs_counted_in_tweet = set()

    for _, note_group in tweet_group.groupby(c.noteIdKey, sort=False):
      # Sort into a new frame instead of in place: the group object is derived
      # from the caller's DataFrame and should not be mutated here.
      ordered = note_group.sort_values(c.createdAtMillisKey)

      # Plain numpy arrays keep the nested loops below cheap.
      times = ordered[c.createdAtMillisKey].values
      raters = ordered[c.raterParticipantIdKey].values

      window_start = 0
      for i in range(len(times)):
        time_i = times[i]
        rater_i = raters[i]

        # Drop ratings that are more than windowMillis older than rating i.
        # The `window_start < i` bound keeps the index in range even when
        # windowMillis is negative; for non-negative windows it never triggers
        # because a rating is always within the window of itself.
        while window_start < i and time_i - times[window_start] > windowMillis:
          window_start += 1

        # Pair rating i with every earlier rating still inside the window.
        for j in range(window_start, i):
          rater_j = raters[j]
          if rater_i == rater_j:
            continue
          # Canonical ordering so (a, b) and (b, a) share one key.
          pair = tuple(sorted((rater_i, rater_j)))
          if pair in pairs_counted_in_tweet:
            continue
          pairs_counted_in_tweet.add(pair)
          pair_counts[pair] = pair_counts.get(pair, 0) + 1

  return pair_counts