def _get_pair_counts()

in sourcecode/scoring/post_selection_similarity_old.py [0:0]


def _get_pair_counts(ratings: pd.DataFrame, windowMillis: int = 1000 * 60 * 30) -> Dict:
  """
  Compute counts of unique posts that were co-rated within windowMillis millis of each other
  by different users.

  Ratings are sorted by (note, creation time).  Each rating is then compared with the later
  ratings of the same note, stopping at the first one created more than windowMillis after it.
  A given pair of raters is credited at most once per post (tweet), even when the two raters
  co-rated several notes attached to that post.

  Args:
    ratings: DataFrame containing at least the c.noteIdKey, c.createdAtMillisKey,
      c.raterParticipantIdKey and c.tweetIdKey columns.
    windowMillis: largest gap in milliseconds (inclusive) between two ratings of the same note
      for them to count as co-rated.  Defaults to 30 minutes.

  Returns dict: (raterId1, raterId2) => count.  The two rater IDs in each key are sorted.

  Raises:
    AssertionError: if two ratings of the same note that get compared carry different tweet
      IDs or the same rater ID, or if the sort order is violated.
  """
  with c.time_block("Computing rating pair counts"):
    counts = dict()
    # (raterPairKey, tweetId) combinations that already contributed to counts.
    seen = set()
    ratings = ratings.sort_values([c.noteIdKey, c.createdAtMillisKey])
    values = ratings[
      [c.noteIdKey, c.createdAtMillisKey, c.raterParticipantIdKey, c.tweetIdKey]
    ].values
    numRatings = len(values)
    logger.info(numRatings)
    for i in range(numRatings):
      priorNote, priorTs, priorRater, priorTweet = values[i]
      # Sparse progress logging: a few early checkpoints, then every 5M rows.
      if i == 0 or i == 1000 or i == 100000 or i % 5000000 == 0:
        logger.info(f"get_pair_counts i={i}")
      for j in range(i + 1, numRatings):
        nextNote, nextTs, nextRater, nextTweet = values[j]
        assert priorNote <= nextNote, (priorNote, nextNote)
        if nextNote != priorNote:
          break  # break if we're onto a new note
        assert priorTweet == nextTweet, (priorTweet, nextTweet)  # tweet should be same
        assert priorRater != nextRater, (priorRater, nextRater)  # rater should be different
        assert priorTs <= nextTs, (priorTs, nextTs)
        if nextTs > (priorTs + windowMillis):
          break  # break if we're beyond windowMillis (rows are time-sorted within a note)
        raterPairKey = tuple(sorted((priorRater, nextRater)))
        raterTweetPairKey = (raterPairKey, priorTweet)
        if raterTweetPairKey in seen:
          # This pair was already credited for this tweet (e.g. via another note on it).
          # Only skip the duplicate: later raters in the window may still form pairs with
          # priorRater that have not been counted yet, so the scan must keep going.
          continue
        seen.add(raterTweetPairKey)
        counts[raterPairKey] = counts.get(raterPairKey, 0) + 1
    return counts