in sourcecode/scoring/post_selection_similarity_old.py [0:0]
def _get_pair_tuples(ratings, windowMillis):
    """Collect pairs of raters who rated the same note close together in time.

    The ratings are ordered by note and then by creation time.  Every rating is
    then matched with each later rating on the same note whose creation time is
    no more than ``windowMillis`` after its own.

    Progress information is written to stdout: the total row count up front,
    then a status line at a few fixed row indices and every 5,000,000 rows.

    Args:
      ratings: table of ratings that supports ``sort_values`` and column
        selection (presumably a pandas DataFrame -- confirm against callers).
        It must contain the columns named by ``c.noteIdKey``,
        ``c.createdAtMillisKey``, ``c.raterParticipantIdKey`` and
        ``c.tweetIdKey``.
      windowMillis: largest gap, in the units of the ``c.createdAtMillisKey``
        column, between two ratings for them to be paired.  A gap exactly
        equal to ``windowMillis`` still counts.

    Returns:
      A list of ``(raterA, raterB, tweetId)`` tuples, one per qualifying pair
      of ratings, with the two rater ids in sorted order.  Entries are not
      deduplicated.

    Raises:
      AssertionError: if the sorted rows are out of order by note or by
        creation time, if two ratings of one note carry different tweet ids,
        or if two compared ratings of one note come from the same rater.
    """
    noteOrdered = ratings.sort_values([c.noteIdKey, c.createdAtMillisKey])
    columns = [c.noteIdKey, c.createdAtMillisKey, c.raterParticipantIdKey, c.tweetIdKey]
    rows = noteOrdered[columns].values
    numRows = len(rows)
    print(numRows)

    tuples = []
    for i, (anchorNote, anchorTs, anchorRater, anchorTweet) in enumerate(rows):
        # Sparse progress output: a few early checkpoints, then every 5M rows.
        if i in (0, 1000, 100000) or i % 5000000 == 0:
            print(f"i={i} len(tuples)={len(tuples)}")

        # Scan forward from the anchor; because the rows are sorted, the scan
        # can stop at the first row on another note or outside the window.
        for laterIdx in range(i + 1, numRows):
            laterNote, laterTs, laterRater, laterTweet = rows[laterIdx]
            assert anchorNote <= laterNote, (anchorNote, laterNote)
            if laterNote != anchorNote:
                break  # reached the next note's ratings

            # Invariants for two ratings on the same note.
            assert anchorTweet == laterTweet, (anchorTweet, laterTweet)
            assert anchorRater != laterRater, (anchorRater, laterRater)
            assert anchorTs <= laterTs, (anchorTs, laterTs)
            if laterTs > (anchorTs + windowMillis):
                break  # every remaining rating on this note is later still

            # Sort the ids so a pair is recorded the same way in either order.
            firstRater, secondRater = sorted((anchorRater, laterRater))
            tuples.append((firstRater, secondRater, anchorTweet))
    return tuples