in sourcecode/scoring/post_selection_similarity_old.py [0:0]
def _get_pair_counts(ratings: pd.DataFrame, windowMillis: int = 1000 * 60 * 30) -> Dict:
    """
    Count, for every pair of raters, the distinct posts they co-rated close in time.

    Ratings are ordered by note and then by creation time. For each rating, the
    ratings that follow it on the same note are scanned in order; the scan ends
    at the first rating that belongs to another note or that was created more
    than windowMillis after the current one. Every rater pair met during the
    scan is credited at most once per post.

    Args:
      ratings: DataFrame holding the columns named by c.noteIdKey,
        c.createdAtMillisKey, c.raterParticipantIdKey and c.tweetIdKey.
      windowMillis: Largest allowed gap, in milliseconds, between two ratings
        of the same note for them to be treated as co-rated (default 30 min).

    Returns:
      dict mapping the sorted tuple (raterId1, raterId2) to the number of
      posts credited to that pair.

    Raises:
      AssertionError: if, after sorting, two ratings of the same note carry
        different posts, come from the same rater, or are out of order.
    """
    with c.time_block("Computing rating pair counts"):
        pairCounts = {}
        countedPairPosts = set()  # (raterPairKey, tweetId) combinations already credited
        ratings = ratings.sort_values([c.noteIdKey, c.createdAtMillisKey])
        columns = [c.noteIdKey, c.createdAtMillisKey, c.raterParticipantIdKey, c.tweetIdKey]
        rows = ratings[columns].values
        numRows = len(rows)
        logger.info(numRows)
        for i in range(numRows):
            priorNote, priorTs, priorRater, priorTweet = rows[i]
            # Sparse progress logging: i == 0 is covered by the modulo test.
            if i % 5000000 == 0 or i in (1000, 100000):
                logger.info(f"get_pair_counts i={i}")
            for j in range(i + 1, numRows):
                nextNote, nextTs, nextRater, nextTweet = rows[j]
                assert priorNote <= nextNote, (priorNote, nextNote)
                if nextNote != priorNote:
                    # Rows are grouped by note, so nothing further can match.
                    break
                assert priorTweet == nextTweet, (priorTweet, nextTweet)  # one post per note
                assert priorRater != nextRater, (priorRater, nextRater)  # one rating per rater
                assert priorTs <= nextTs, (priorTs, nextTs)
                if nextTs > (priorTs + windowMillis):
                    # Rows are time-ordered within a note, so all later ones are too late.
                    break
                raterPairKey = tuple(sorted((priorRater, nextRater)))
                pairPostKey = (raterPairKey, priorTweet)
                if pairPostKey in countedPairPosts:
                    # NOTE(review): this ends the whole forward scan, not just this
                    # pair, so later raters inside the window are never paired with
                    # the current rating. Confirm that is intended before changing.
                    break
                countedPairPosts.add(pairPostKey)
                pairCounts[raterPairKey] = pairCounts.get(raterPairKey, 0) + 1
        return pairCounts