# sourcecode/scoring/constants.py (827 lines of code) (raw):
from contextlib import contextmanager
from dataclasses import dataclass
from enum import Enum
import logging
import os
import time
from typing import Dict, Optional, Set
import numpy as np
import pandas as pd
# Module-level logger; time_block (defined later in this file) reports through it.
logger = logging.getLogger("birdwatch.constants")
logger.setLevel(logging.INFO)
# Default number of threads to use in torch when no value is specified:
# os.cpu_count() when it reports a value, otherwise 8 (os.cpu_count() can
# return None, in which case the `or` selects the fallback).
defaultNumThreads = os.cpu_count() or 8
# Store the timestamp at which the constants module is initialized. Note
# that module initialization occurs only once regardless of how many times
# the module is imported (see link below). Storing a designated timestamp
# as a constant allows us to:
#   - Use a consistent notion of "now" throughout scorer execution.
#   - Overwrite "now" when system testing to reduce spurious diffs.
#
# https://docs.python.org/3/tutorial/modules.html#more-on-modules
#
# time.time() returns seconds, so this is a float number of milliseconds.
epochMillis = 1000 * time.time()
# NOTE(review): presumably tells note status history code to use the current time
# rather than epochMillis; the consumer is not in this file -- confirm with callers.
useCurrentTimeInsteadOfEpochMillisForNoteStatusHistory = True
# Use this size threshold to isolate code which should be run differently in small
# scale unit tests.
minNumNotesForProdData = 200
# Define limit on how old a note needs to be to lock (14 days, in milliseconds).
noteLockMillis = 14 * 24 * 60 * 60 * 1000
# Explanation Tags
minRatingsToGetTag = 2
minTagsNeededForStatus = 2
tagPercentileForNormalization = 40
intervalHalfWidth = 0.3
# Max flip rates
prescoringAllUnlockedNotesMaxCrhChurn = 0.2
prescoringAllNotesCreatedThreeToThirteenDaysAgoMaxChurn = 0.06
finalUnlockedNotesWithNoNewRatingsMaxCrhChurn = 0.05
finalNotesWithNewRatingsMaxNewCrhChurn = 0.80
finalNotesWithNewRatingsMaxOldCrhChurn = 0.25
# NOTE(review): 1e8 is orders of magnitude above the other limits in this section;
# presumably it effectively disables the limit for these categories -- confirm.
finalNotesThatJustFlippedStatusMaxCrhChurn = 1e8
finalNotesThatFlippedRecentlyMaxCrhChurn = 1e8
# TODO(jiansongc): adjust these 2 below
finalNotesNmrDueToMinStableCrhTimeMaxOldCrhChurn = 1.0
finalNotesNmrDueToMinStableCrhTimeMaxNewCrhChurn = 1.0
# Data Filenames
scoredNotesOutputPath = "scoredNotes.tsv"
enrollmentInputPath = "userEnrollment-00000.tsv"
notesInputPath = "notes-00000.tsv"
ratingsInputPath = "ratings"
noteStatusHistoryInputPath = "noteStatusHistory-00000.tsv"
# TSV Column Names
participantIdKey = "participantId"
helpfulKey = "helpful"
notHelpfulKey = "notHelpful"
helpfulnessLevelKey = "helpfulnessLevel"
createdAtMillisKey = "createdAtMillis"
summaryKey = "summary"
noteTopicKey = "noteTopic"
# NOTE: this name is assigned the same value a second time in the
# "Earn In + Earn Out" section further down this file.
authorTopNotHelpfulTagValues = "authorTopNotHelpfulTagValues"
modelingPopulationKey = "modelingPopulation"
modelingGroupKey = "modelingGroup"
modelingMultiGroupKey = "modelingMultiGroup"
numberOfTimesEarnedOutKey = "numberOfTimesEarnedOut"
defaultIndexKey = "index"
# Scoring Groups
# The three sets below are pairwise disjoint and together cover the ids 0-32.
coreGroups: Set[int] = {1, 2, 3, 6, 8, 9, 10, 11, 13, 14, 19, 21, 25}
expansionGroups: Set[int] = {0, 4, 5, 7, 12, 15, 16, 18, 20, 22, 23, 26, 27, 28, 29}
expansionPlusGroups: Set[int] = {17, 24, 30, 31, 32}
# TSV Values
notHelpfulValueTsv = "NOT_HELPFUL"
somewhatHelpfulValueTsv = "SOMEWHAT_HELPFUL"
helpfulValueTsv = "HELPFUL"
notesSaysTweetIsMisleadingKey = "MISINFORMED_OR_POTENTIALLY_MISLEADING"
noteSaysTweetIsNotMisleadingKey = "NOT_MISLEADING"
# Fields Transformed From the Raw Data
helpfulNumKey = "helpfulNum"
ratingCreatedBeforeMostRecentNMRLabelKey = "ratingCreatedBeforeMostRecentNMRLabel"
ratingCreatedBeforePublicTSVReleasedKey = "ratingCreatedBeforePublicTSVReleased"
# Timestamps (milliseconds; the first three are milliseconds since the Unix epoch)
deletedNoteTombstonesLaunchTime = 1652918400000 # May 19, 2022 UTC
notMisleadingUILaunchTime = 1664755200000 # October 3, 2022 UTC
# NOTE(review): 1639699200000 ms decodes to 2021-12-17 00:00:00 UTC, which does not
# match the 12/15 date in the trailing comment -- confirm which one is intended.
lastRatingTagsChangeTimeMillis = 1639699200000 # 2021/12/15 UTC
# A duration rather than a point in time: 48 * 60 * 60 * 1000 ms.
publicTSVTimeDelay = 172800000 # 48 hours
# Explanation Tags
tagCountsKey = "tagCounts"
tiebreakOrderKey = "tiebreakOrder"
firstTagKey = "firstTag"
secondTagKey = "secondTag"
activeFilterTagsKey = "activeFilterTags"
# Contributor Counts
# NOTE: successfulRatingHelpfulCount is assigned the same value a second time in
# the "Earn In + Earn Out" section further down this file.
successfulRatingHelpfulCount = "successfulRatingHelpfulCount"
successfulRatingNotHelpfulCount = "successfulRatingNotHelpfulCount"
successfulRatingTotal = "successfulRatingTotal"
unsuccessfulRatingHelpfulCount = "unsuccessfulRatingHelpfulCount"
unsuccessfulRatingNotHelpfulCount = "unsuccessfulRatingNotHelpfulCount"
unsuccessfulRatingTotal = "unsuccessfulRatingTotal"
ratingsAwaitingMoreRatings = "ratingsAwaitingMoreRatings"
ratedAfterDecision = "ratedAfterDecision"
notesCurrentlyRatedHelpful = "notesCurrentlyRatedHelpful"
notesCurrentlyRatedNotHelpful = "notesCurrentlyRatedNotHelpful"
notesAwaitingMoreRatings = "notesAwaitingMoreRatings"
# Meta Scoring Columns
finalRatingStatusKey = "finalRatingStatus"
unlockedRatingStatusKey = "unlockedRatingStatus"
metaScorerActiveRulesKey = "metaScorerActiveRules"
decidedByKey = "decidedBy"
rescoringActiveRulesKey = "rescoringActiveRules"
# Note Status Changes Columns
noteFinalStatusChange = "finalStatusChange"
noteNewRatings = "newRatings"
noteDecidedByChange = "decidedByChange"
noteAllAddedRules = "allAddedRules"
noteAllRemovedRules = "allRemovedRules"
noteDecidedByInterceptChange = "decidedByInterceptChange"
# Internal Scoring Columns. These columns should be renamed before writing to disk.
internalNoteInterceptKey = "internalNoteIntercept"
internalRaterInterceptKey = "internalRaterIntercept"
internalNoteFactorKeyBase = "internalNoteFactor"
internalRaterFactorKeyBase = "internalRaterFactor"
internalRatingStatusKey = "internalRatingStatus"
internalActiveRulesKey = "internalActiveRules"
internalRaterReputationKey = "internalRaterReputation"
scorerNameKey = "scorerName"


def note_factor_key(i):
    """Return the internal note factor column name for factor index ``i``.

    The name is ``internalNoteFactorKeyBase`` followed by ``str(i)``.
    """
    return f"{internalNoteFactorKeyBase}{i!s}"


def rater_factor_key(i):
    """Return the internal rater factor column name for factor index ``i``.

    The name is ``internalRaterFactorKeyBase`` followed by ``str(i)``.
    """
    return f"{internalRaterFactorKeyBase}{i!s}"


# Column names for the first note / rater factor.
internalNoteFactor1Key = note_factor_key(1)
internalRaterFactor1Key = rater_factor_key(1)
# Output Scoring Columns.
# One group of column names per model; each value is the model name followed by
# the parameter name (e.g. "core" + "NoteIntercept").
# Core Model
coreNoteInterceptKey = "coreNoteIntercept"
coreNoteFactor1Key = "coreNoteFactor1"
coreRaterInterceptKey = "coreRaterIntercept"
coreRaterFactor1Key = "coreRaterFactor1"
coreRatingStatusKey = "coreRatingStatus"
coreActiveRulesKey = "coreActiveRules"
coreNoteInterceptMaxKey = "coreNoteInterceptMax"
coreNoteInterceptMinKey = "coreNoteInterceptMin"
coreNumFinalRoundRatingsKey = "coreNumFinalRoundRatings"
# Expansion Model
expansionNoteInterceptKey = "expansionNoteIntercept"
expansionNoteFactor1Key = "expansionNoteFactor1"
expansionRatingStatusKey = "expansionRatingStatus"
expansionNoteInterceptMaxKey = "expansionNoteInterceptMax"
expansionNoteInterceptMinKey = "expansionNoteInterceptMin"
expansionInternalActiveRulesKey = "expansionActiveRules"
expansionNumFinalRoundRatingsKey = "expansionNumFinalRoundRatings"
expansionRaterFactor1Key = "expansionRaterFactor1"
expansionRaterInterceptKey = "expansionRaterIntercept"
# ExpansionPlus Model
expansionPlusNoteInterceptKey = "expansionPlusNoteIntercept"
expansionPlusNoteFactor1Key = "expansionPlusNoteFactor1"
expansionPlusRatingStatusKey = "expansionPlusRatingStatus"
expansionPlusInternalActiveRulesKey = "expansionPlusActiveRules"
expansionPlusNumFinalRoundRatingsKey = "expansionPlusNumFinalRoundRatings"
expansionPlusRaterFactor1Key = "expansionPlusRaterFactor1"
expansionPlusRaterInterceptKey = "expansionPlusRaterIntercept"
# Coverage / Helpfulness Reputation Model
coverageNoteInterceptKey = "coverageNoteIntercept"
coverageNoteFactor1Key = "coverageNoteFactor1"
coverageRatingStatusKey = "coverageRatingStatus"
coverageNoteInterceptMaxKey = "coverageNoteInterceptMax"
coverageNoteInterceptMinKey = "coverageNoteInterceptMin"
raterHelpfulnessReputationKey = "raterHelpfulnessReputation"
# Group Model
groupNoteInterceptKey = "groupNoteIntercept"
groupNoteFactor1Key = "groupNoteFactor1"
groupRatingStatusKey = "groupRatingStatus"
groupNoteInterceptMaxKey = "groupNoteInterceptMax"
groupNoteInterceptMinKey = "groupNoteInterceptMin"
groupRaterInterceptKey = "groupRaterIntercept"
groupRaterFactor1Key = "groupRaterFactor1"
groupInternalActiveRulesKey = "groupActiveRules"
groupNumFinalRoundRatingsKey = "groupNumFinalRoundRatings"
# MultiGroup Model
multiGroupNoteInterceptKey = "multiGroupNoteIntercept"
multiGroupNoteFactor1Key = "multiGroupNoteFactor1"
multiGroupRatingStatusKey = "multiGroupRatingStatus"
multiGroupRaterInterceptKey = "multiGroupRaterIntercept"
multiGroupRaterFactor1Key = "multiGroupRaterFactor1"
multiGroupInternalActiveRulesKey = "multiGroupActiveRules"
multiGroupNumFinalRoundRatingsKey = "multiGroupNumFinalRoundRatings"
# Topic Model
topicNoteInterceptKey = "topicNoteIntercept"
topicNoteFactor1Key = "topicNoteFactor1"
topicRatingStatusKey = "topicRatingStatus"
topicNoteConfidentKey = "topicNoteConfident"
topicInternalActiveRulesKey = "topicActiveRules"
topicNumFinalRoundRatingsKey = "topicNumFinalRoundRatings"
# Harassment/Abuse Tag
harassmentNoteInterceptKey = "harassmentNoteIntercept"
harassmentNoteFactor1Key = "harassmentNoteFactor1"
harassmentRaterInterceptKey = "harassmentRaterIntercept"
harassmentRaterFactor1Key = "harassmentRaterFactor1"
# Ids and Indexes
noteIdKey = "noteId"
tweetIdKey = "tweetId"
classificationKey = "classification"
noteAuthorParticipantIdKey = "noteAuthorParticipantId"
raterParticipantIdKey = "raterParticipantId"
# Aggregations
noteCountKey = "noteCount"
ratingCountKey = "ratingCount"
numRatingsKey = "numRatings"
numRatingsLast28DaysKey = "numRatingsLast28"
ratingFromInitialModelingGroupKey = "ratingFromInitialModelingGroup"
percentFromInitialModelingGroupKey = "percentFromInitialModelingGroup"
numFinalRoundRatingsKey = "numFinalRoundRatings"
# Helpfulness Score Keys
crhRatioKey = "CRHRatio"
crnhRatioKey = "CRNHRatio"
crhCrnhRatioDifferenceKey = "crhCrnhRatioDifference"
meanNoteScoreKey = "meanNoteScore"
raterAgreeRatioKey = "raterAgreeRatio"
ratingAgreesWithNoteStatusKey = "ratingAgreesWithNoteStatus"
aboveHelpfulnessThresholdKey = "aboveHelpfulnessThreshold"
totalHelpfulHarassmentRatingsPenaltyKey = "totalHelpfulHarassmentPenalty"
# NOTE(review): unlike its neighbours, this column name itself contains "Key";
# presumably kept for compatibility with existing data -- confirm before changing.
raterAgreeRatioWithHarassmentAbusePenaltyKey = "raterAgreeRatioKeyWithHarassmentAbusePenalty"
# Note Status Labels
currentlyRatedHelpful = "CURRENTLY_RATED_HELPFUL"
currentlyRatedNotHelpful = "CURRENTLY_RATED_NOT_HELPFUL"
needsMoreRatings = "NEEDS_MORE_RATINGS"
# FIRM_REJECT is set by individual scorers to indicate downstream scorers should not CRH
# a note, but is never set as the finalRatingStatus of a note.
firmReject = "FIRM_REJECT"
# Boolean Note Status Labels
currentlyRatedHelpfulBoolKey = "crhBool"
currentlyRatedNotHelpfulBoolKey = "crnhBool"
awaitingMoreRatingsBoolKey = "awaitingBool"
# Helpful tag column names.
helpfulOtherTagKey = "helpfulOther"
helpfulInformativeTagKey = "helpfulInformative"
helpfulClearTagKey = "helpfulClear"
helpfulEmpatheticTagKey = "helpfulEmpathetic"
helpfulGoodSourcesTagKey = "helpfulGoodSources"
helpfulUniqueContextTagKey = "helpfulUniqueContext"
helpfulAddressesClaimTagKey = "helpfulAddressesClaim"
helpfulImportantContextTagKey = "helpfulImportantContext"
helpfulUnbiasedLanguageTagKey = "helpfulUnbiasedLanguage"
# (tiebreak index, tag) pairs. The list order is the TSV column order; the integer
# is the tag's position once the pairs are sorted for tiebreaking.
helpfulTagsAndTieBreakOrder = [
    (0, helpfulOtherTagKey),
    (8, helpfulInformativeTagKey),
    (7, helpfulClearTagKey),
    (3, helpfulEmpatheticTagKey),
    (4, helpfulGoodSourcesTagKey),
    (2, helpfulUniqueContextTagKey),
    (5, helpfulAddressesClaimTagKey),
    (6, helpfulImportantContextTagKey),
    (1, helpfulUnbiasedLanguageTagKey),
]
# Tag names in TSV column order, alone and paired with dtypes.
helpfulTagsTSVOrder = [tagName for _, tagName in helpfulTagsAndTieBreakOrder]
helpfulTagBoolsAndTypesTSVOrder = [
    (tagName, pd.Int8Dtype()) for tagName in helpfulTagsTSVOrder
]
# Tag names sorted by ascending tiebreak index.
helpfulTagsTiebreakOrder = [tagName for _, tagName in sorted(helpfulTagsAndTieBreakOrder)]
helpfulTagCountsAndTypesTSVOrder = [
    (tagName, pd.Int64Dtype()) for tagName in helpfulTagsTSVOrder
]
# NOTE: Always add new tags to the end of this list, and *never* change the order of
# elements which are already in the list to maintain compatibility with
# BirdwatchNoteNotHelpfulTags.get in Scala.
notHelpfulIncorrectTagKey = "notHelpfulIncorrect"
notHelpfulOtherTagKey = "notHelpfulOther"
notHelpfulSpamHarassmentOrAbuseTagKey = "notHelpfulSpamHarassmentOrAbuse"
notHelpfulArgumentativeOrBiasedTagKey = "notHelpfulArgumentativeOrBiased"
notHelpfulHardToUnderstandKey = "notHelpfulHardToUnderstand"
notHelpfulNoteNotNeededKey = "notHelpfulNoteNotNeeded"
notHelpfulSourcesMissingOrUnreliableTagKey = "notHelpfulSourcesMissingOrUnreliable"
notHelpfulIrrelevantSourcesTagKey = "notHelpfulIrrelevantSources"
notHelpfulOpinionSpeculationOrBiasTagKey = "notHelpfulOpinionSpeculationOrBias"
notHelpfulMissingKeyPointsTagKey = "notHelpfulMissingKeyPoints"
notHelpfulOutdatedTagKey = "notHelpfulOutdated"
notHelpfulOffTopicTagKey = "notHelpfulOffTopic"
notHelpfulOpinionSpeculationTagKey = "notHelpfulOpinionSpeculation"
## This list is in TSV Order, but with indices for tiebreak order.
notHelpfulTagsAndTieBreakOrder = [
    (0, notHelpfulOtherTagKey),  ## should lose all tiebreaks
    (8, notHelpfulIncorrectTagKey),
    (2, notHelpfulSourcesMissingOrUnreliableTagKey),
    (4, notHelpfulOpinionSpeculationOrBiasTagKey),
    (5, notHelpfulMissingKeyPointsTagKey),
    (12, notHelpfulOutdatedTagKey),  ## should win all tiebreaks
    (10, notHelpfulHardToUnderstandKey),
    (7, notHelpfulArgumentativeOrBiasedTagKey),
    (9, notHelpfulOffTopicTagKey),
    (11, notHelpfulSpamHarassmentOrAbuseTagKey),
    (1, notHelpfulIrrelevantSourcesTagKey),
    (3, notHelpfulOpinionSpeculationTagKey),
    (6, notHelpfulNoteNotNeededKey),
]
# Tag names in TSV column order, alone and paired with dtypes.
notHelpfulTagsTSVOrder = [tagName for _, tagName in notHelpfulTagsAndTieBreakOrder]
notHelpfulTagsAndTypesTSVOrder = [
    (tagName, pd.Int8Dtype()) for tagName in notHelpfulTagsTSVOrder
]
notHelpfulTagCountsAndTypesTSVOrder = [
    (tagName, pd.Int64Dtype()) for tagName in notHelpfulTagsTSVOrder
]
# Tag names sorted by ascending tiebreak index.
notHelpfulTagsTiebreakOrder = [tagName for _, tagName in sorted(notHelpfulTagsAndTieBreakOrder)]
# tag -> tiebreak index.
notHelpfulTagsTiebreakMapping = {
    tagName: rank for rank, tagName in notHelpfulTagsAndTieBreakOrder
}
# tag -> position in TSV order.
notHelpfulTagsEnumMapping = {
    tagName: position for position, tagName in enumerate(notHelpfulTagsTSVOrder)
}
# "<tag>Adjusted" columns, one per not-helpful tag, in TSV order.
adjustedSuffix = "Adjusted"
notHelpfulTagsAdjustedColumns = [tagName + adjustedSuffix for tagName in notHelpfulTagsTSVOrder]
notHelpfulTagsAdjustedTSVColumnsAndTypes = [
    (name, np.double) for name in notHelpfulTagsAdjustedColumns
]
# "<tag>AdjustedRatio" columns, derived from the adjusted column names.
ratioSuffix = "Ratio"
notHelpfulTagsAdjustedRatioColumns = [
    name + ratioSuffix for name in notHelpfulTagsAdjustedColumns
]
notHelpfulTagsAdjustedRatioTSVColumnsAndTypes = [
    (name, np.double) for name in notHelpfulTagsAdjustedRatioColumns
]
ratingWeightKey = "ratingWeight"
incorrectTagRatingsMadeByRaterKey = "incorrectTagRatingsMadeByRater"
totalRatingsMadeByRaterKey = "totalRatingsMadeByRater"
noteTfIdfIncorrectScoreKey = "tf_idf_incorrect"
# num voters who rated a note
numVotersKey = "num_voters"
incorrectTagRateByRaterKey = "p_incorrect_user"
# note's tf-idf scores from within the interval
noteTfIdfIncorrectScoreIntervalKey = "tf_idf_incorrect_interval"
# num voters (in the interval) who rated a note
numVotersIntervalKey = "num_voters_interval"
# sum of p_incorrect_user for all raters who rated a note in the interval
sumOfIncorrectTagRateByRaterIntervalKey = "p_incorrect_user_interval"
# notHelpfulIncorrect ratings on the note in the interval
notHelpfulIncorrectIntervalKey = "notHelpfulIncorrect_interval"
# Low diligence model columns.
lowDiligenceInterceptKey = "lowDiligenceIntercept"
lowDiligenceRaterFactor1Key = "lowDiligenceRaterFactor1"
lowDiligenceRaterInterceptKey = "lowDiligenceRaterIntercept"
lowDiligenceRaterReputationKey = "lowDiligenceRaterReputation"
lowDiligenceNoteFactor1Key = "lowDiligenceNoteFactor1"
lowDiligenceNoteInterceptKey = "lowDiligenceNoteIntercept"
# Same column name as lowDiligenceInterceptKey above.
lowDiligenceLegacyNoteInterceptKey = "lowDiligenceIntercept"
lowDiligenceNoteInterceptRound2Key = "lowDiligenceNoteInterceptRound2"
internalNoteInterceptRound2Key = "internalNoteInterceptRound2"
lowDiligenceRaterInterceptRound2Key = "lowDiligenceRaterInterceptRound2"
internalRaterInterceptRound2Key = "internalRaterInterceptRound2"
# (column, dtype) pairs for the incorrect filter columns.
incorrectFilterColumnsAndTypes = [
    (notHelpfulIncorrectIntervalKey, np.double),
    (sumOfIncorrectTagRateByRaterIntervalKey, np.double),
    (numVotersIntervalKey, np.double),
    (noteTfIdfIncorrectScoreIntervalKey, np.double),
    (lowDiligenceLegacyNoteInterceptKey, np.double),
]
incorrectFilterColumns = [name for name, _ in incorrectFilterColumnsAndTypes]
# Tag columns for notes that mark the tweet as misleading.
misleadingOtherKey = "misleadingOther"
misleadingFactualErrorKey = "misleadingFactualError"
misleadingManipulatedMediaKey = "misleadingManipulatedMedia"
misleadingOutdatedInformationKey = "misleadingOutdatedInformation"
misleadingMissingImportantContextKey = "misleadingMissingImportantContext"
misleadingUnverifiedClaimAsFactKey = "misleadingUnverifiedClaimAsFact"
misleadingSatireKey = "misleadingSatire"
misleadingTags = [
    misleadingOtherKey,
    misleadingFactualErrorKey,
    misleadingManipulatedMediaKey,
    misleadingOutdatedInformationKey,
    misleadingMissingImportantContextKey,
    misleadingUnverifiedClaimAsFactKey,
    misleadingSatireKey,
]
misleadingTagsAndTypes = [(tagName, pd.Int8Dtype()) for tagName in misleadingTags]
# Tag columns for notes that mark the tweet as not misleading.
notMisleadingOtherKey = "notMisleadingOther"
notMisleadingFactuallyCorrectKey = "notMisleadingFactuallyCorrect"
notMisleadingOutdatedButNotWhenWrittenKey = "notMisleadingOutdatedButNotWhenWritten"
notMisleadingClearlySatireKey = "notMisleadingClearlySatire"
notMisleadingPersonalOpinionKey = "notMisleadingPersonalOpinion"
notMisleadingTags = [
    notMisleadingOtherKey,
    notMisleadingFactuallyCorrectKey,
    notMisleadingOutdatedButNotWhenWrittenKey,
    notMisleadingClearlySatireKey,
    notMisleadingPersonalOpinionKey,
]
notMisleadingTagsAndTypes = [(tagName, pd.Int8Dtype()) for tagName in notMisleadingTags]
# Remaining note columns.
believableKey = "believable"
harmfulKey = "harmful"
validationDifficultyKey = "validationDifficulty"
trustworthySourcesKey = "trustworthySources"
isMediaNoteKey = "isMediaNote"
# (column, dtype) pairs for the notes TSV: fixed leading columns, then the
# misleading / not-misleading tag columns, then the trailing columns.
noteTSVColumnsAndTypes = [
    (noteIdKey, np.int64),
    (noteAuthorParticipantIdKey, object),
    (createdAtMillisKey, np.int64),
    (tweetIdKey, np.int64),
    (classificationKey, object),
    (believableKey, "category"),
    (harmfulKey, "category"),
    (validationDifficultyKey, "category"),
    *misleadingTagsAndTypes,
    *notMisleadingTagsAndTypes,
    (trustworthySourcesKey, pd.Int8Dtype()),
    (summaryKey, object),
    (isMediaNoteKey, pd.Int8Dtype()),
]
# Column names, dtypes, and column -> dtype lookup derived from the pairs above.
noteTSVColumns = [name for name, _ in noteTSVColumnsAndTypes]
noteTSVTypes = [kind for _, kind in noteTSVColumnsAndTypes]
noteTSVTypeMapping = dict(noteTSVColumnsAndTypes)
# Remaining rating columns.
versionKey = "version"
agreeKey = "agree"
disagreeKey = "disagree"
ratedOnTweetIdKey = "ratedOnTweetId"
# (column, dtype) pairs for the ratings TSV: fixed leading columns, then the
# helpful and not-helpful tag columns, then the rated-on tweet id.
ratingTSVColumnsAndTypes = [
    (noteIdKey, np.int64),
    (raterParticipantIdKey, object),
    (createdAtMillisKey, np.int64),
    (versionKey, pd.Int8Dtype()),
    (agreeKey, pd.Int8Dtype()),
    (disagreeKey, pd.Int8Dtype()),
    (helpfulKey, pd.Int8Dtype()),
    (notHelpfulKey, pd.Int8Dtype()),
    (helpfulnessLevelKey, "category"),
    *helpfulTagBoolsAndTypesTSVOrder,
    *notHelpfulTagsAndTypesTSVOrder,
    (ratedOnTweetIdKey, np.int64),
]
# Column names, dtypes, and column -> dtype lookup derived from the pairs above.
ratingTSVColumns = [name for name, _ in ratingTSVColumnsAndTypes]
ratingTSVTypes = [kind for _, kind in ratingTSVColumnsAndTypes]
ratingTSVTypeMapping = dict(ratingTSVColumnsAndTypes)
# Note status history column names.
timestampMillisOfNoteFirstNonNMRLabelKey = "timestampMillisOfFirstNonNMRStatus"
firstNonNMRLabelKey = "firstNonNMRStatus"
timestampMillisOfNoteCurrentLabelKey = "timestampMillisOfCurrentStatus"
currentLabelKey = "currentStatus"
timestampMillisOfNoteMostRecentNonNMRLabelKey = "timestampMillisOfLatestNonNMRStatus"
mostRecentNonNMRLabelKey = "mostRecentNonNMRStatus"
timestampMillisOfStatusLockKey = "timestampMillisOfStatusLock"
lockedStatusKey = "lockedStatus"
timestampMillisOfRetroLockKey = "timestampMillisOfRetroLock"
currentCoreStatusKey = "currentCoreStatus"
currentExpansionStatusKey = "currentExpansionStatus"
currentGroupStatusKey = "currentGroupStatus"
currentDecidedByKey = "currentDecidedBy"
currentModelingGroupKey = "currentModelingGroup"
timestampMillisOfMostRecentStatusChangeKey = "timestampMillisOfMostRecentStatusChange"
currentMultiGroupStatusKey = "currentMultiGroupStatus"
currentModelingMultiGroupKey = "currentModelingMultiGroup"
timestampMillisOfNmrDueToMinStableCrhTimeKey = "timestampMillisOfNmrDueToMinStableCrhTime"
updatedTimestampMillisOfNmrDueToMinStableCrhTimeKey = (
    "updatedTimestampMillisOfNmrDueToMinStableCrhTime"
)
timestampMinuteOfFinalScoringOutput = "timestampMinuteOfFinalScoringOutput"
timestampMillisOfFirstNmrDueToMinStableCrhTimeKey = "timestampMillisOfFirstNmrDueToMinStableCrhTime"
# (column, dtype) pairs for the note status history TSV.
noteStatusHistoryTSVColumnsAndTypes = [
    (noteIdKey, np.int64),
    (noteAuthorParticipantIdKey, object),
    (createdAtMillisKey, np.int64),
    (timestampMillisOfNoteFirstNonNMRLabelKey, np.double),  # double because nullable.
    (firstNonNMRLabelKey, "category"),
    (timestampMillisOfNoteCurrentLabelKey, np.double),  # double because nullable.
    (currentLabelKey, "category"),
    (timestampMillisOfNoteMostRecentNonNMRLabelKey, np.double),  # double because nullable.
    (mostRecentNonNMRLabelKey, "category"),
    (timestampMillisOfStatusLockKey, np.double),  # double because nullable.
    (lockedStatusKey, "category"),
    (timestampMillisOfRetroLockKey, np.double),  # double because nullable.
    (currentCoreStatusKey, "category"),
    (currentExpansionStatusKey, "category"),
    (currentGroupStatusKey, "category"),
    (currentDecidedByKey, "category"),
    (currentModelingGroupKey, np.double),  # TODO: int
    (timestampMillisOfMostRecentStatusChangeKey, np.double),  # double because nullable.
    (timestampMillisOfNmrDueToMinStableCrhTimeKey, np.double),  # double because nullable.
    (currentMultiGroupStatusKey, "category"),
    (currentModelingMultiGroupKey, np.double),  # TODO: int
    (timestampMinuteOfFinalScoringOutput, np.double),  # double because nullable.
    (timestampMillisOfFirstNmrDueToMinStableCrhTimeKey, np.double),  # double because nullable.
]
# Column names, dtypes, and column -> dtype lookup derived from the pairs above.
noteStatusHistoryTSVColumns = [name for name, _ in noteStatusHistoryTSVColumnsAndTypes]
noteStatusHistoryTSVTypes = [kind for _, kind in noteStatusHistoryTSVColumnsAndTypes]
noteStatusHistoryTSVTypeMapping = dict(noteStatusHistoryTSVColumnsAndTypes)
# TODO(jiansongc): clean up after new column is in production.
# The "Old" variants are the same schema without its last column.
noteStatusHistoryTSVColumnsOld = noteStatusHistoryTSVColumns[:-1]
noteStatusHistoryTSVColumnsAndTypesOld = noteStatusHistoryTSVColumnsAndTypes[:-1]
noteStatusHistoryTSVTypeMappingOld = dict(noteStatusHistoryTSVColumnsAndTypesOld)
# Earn In + Earn Out
enrollmentState = "enrollmentState"
successfulRatingNeededToEarnIn = "successfulRatingNeededToEarnIn"
timestampOfLastStateChange = "timestampOfLastStateChange"
timestampOfLastEarnOut = "timestampOfLastEarnOut"
authorTopNotHelpfulTagValues = "authorTopNotHelpfulTagValues"
maxHistoryEarnOut = 5
successfulRatingHelpfulCount = "successfulRatingHelpfulCount"
# Enrollment state names.
earnedIn = "earnedIn"
atRisk = "atRisk"
earnedOutNoAcknowledge = "earnedOutNoAcknowledge"
earnedOutAcknowledged = "earnedOutAcknowledged"
newUser = "newUser"
removed = "removed"
isAtRiskCRNHCount = 2
ratingImpactForEarnIn = 5
ratingImpact = "ratingImpact"
# Enrollment state name -> integer code; codes are consecutive from 0 in this order.
enrollmentStateToThrift = {
    stateName: thriftCode
    for thriftCode, stateName in enumerate(
        (earnedIn, atRisk, earnedOutNoAcknowledge, earnedOutAcknowledged, newUser, removed)
    )
}
# Emerging writer settings.
emergingWriterDays = 28
isEmergingWriterKey = "isEmergingWriter"
emergingMeanNoteScore = 0.3
emergingRatingCount = 10
aggregateRatingReceivedTotal = "aggregateRatingReceivedTotal"
# Modeling population labels.
core = "CORE"
expansion = "EXPANSION"
expansionPlus = "EXPANSION_PLUS"
# Top writer settings.
topWriterWritingImpact = 10
topWriterHitRate = 0.04
hasCrnhSinceEarnOut = "hasCrnhSinceEarnOut"
# (column, dtype) pairs for the user enrollment TSV.
userEnrollmentTSVColumnsAndTypes = [
    (participantIdKey, str),
    (enrollmentState, str),
    (successfulRatingNeededToEarnIn, np.int64),
    (timestampOfLastStateChange, np.int64),
    (timestampOfLastEarnOut, np.double),  # double because nullable.
    (modelingPopulationKey, "category"),
    (modelingGroupKey, np.float64),
    (numberOfTimesEarnedOutKey, np.int64),
]
# Column names, dtypes, and column -> dtype lookup derived from the pairs above.
userEnrollmentTSVColumns = [name for name, _ in userEnrollmentTSVColumnsAndTypes]
userEnrollmentTSVTypes = [kind for _, kind in userEnrollmentTSVColumnsAndTypes]
userEnrollmentTSVTypeMapping = dict(userEnrollmentTSVColumnsAndTypes)
# Note parameter uncertainty columns.
noteInterceptMaxKey = "internalNoteIntercept_max"
noteInterceptMinKey = "internalNoteIntercept_min"
# "Main" columns: the max and min note intercept.
noteParameterUncertaintyTSVMainColumnsAndTypes = [
    (noteInterceptMaxKey, np.double),
    (noteInterceptMinKey, np.double),
]
# "Aux" columns.
noteParameterUncertaintyTSVAuxColumnsAndTypes = [
    ("internalNoteFactor1_max", np.double),
    ("internalNoteFactor1_median", np.double),
    ("internalNoteFactor1_min", np.double),
    ("internalNoteFactor1_refit_orig", np.double),
    ("internalNoteIntercept_median", np.double),
    ("internalNoteIntercept_refit_orig", np.double),
    ("ratingCount_all", np.int64),
    ("ratingCount_neg_fac", np.int64),
    ("ratingCount_pos_fac", np.int64),
]
# Full schema: aux columns first, then main columns.
noteParameterUncertaintyTSVColumnsAndTypes = [
    *noteParameterUncertaintyTSVAuxColumnsAndTypes,
    *noteParameterUncertaintyTSVMainColumnsAndTypes,
]
# Column-name lists for the full, aux and main schemas.
noteParameterUncertaintyTSVColumns = [
    name for name, _ in noteParameterUncertaintyTSVColumnsAndTypes
]
noteParameterUncertaintyTSVAuxColumns = [
    name for name, _ in noteParameterUncertaintyTSVAuxColumnsAndTypes
]
noteParameterUncertaintyTSVMainColumns = [
    name for name, _ in noteParameterUncertaintyTSVMainColumnsAndTypes
]
# Dtypes and column -> dtype lookup for the full schema.
noteParameterUncertaintyTSVTypes = [
    kind for _, kind in noteParameterUncertaintyTSVColumnsAndTypes
]
noteParameterUncertaintyTSVTypeMapping = dict(noteParameterUncertaintyTSVColumnsAndTypes)
# (column, dtype) pairs for the auxiliary scored notes TSV: fixed leading columns
# followed by the helpful / not-helpful tag counts, the adjusted and adjusted-ratio
# not-helpful tag columns, and the incorrect filter columns.
auxiliaryScoredNotesTSVColumnsAndTypes = [
    (noteIdKey, np.int64),
    (ratingWeightKey, np.double),
    (createdAtMillisKey, np.int64),
    (noteAuthorParticipantIdKey, object),
    (awaitingMoreRatingsBoolKey, np.int8),
    (numRatingsLast28DaysKey, np.int64),
    (currentLabelKey, str),
    (currentlyRatedHelpfulBoolKey, np.int8),
    (currentlyRatedNotHelpfulBoolKey, np.int8),
    (unlockedRatingStatusKey, str),
    *helpfulTagCountsAndTypesTSVOrder,
    *notHelpfulTagCountsAndTypesTSVOrder,
    *notHelpfulTagsAdjustedTSVColumnsAndTypes,
    *notHelpfulTagsAdjustedRatioTSVColumnsAndTypes,
    *incorrectFilterColumnsAndTypes,
]
# Column names and column -> dtype lookup derived from the pairs above.
auxiliaryScoredNotesTSVColumns = [name for name, _ in auxiliaryScoredNotesTSVColumnsAndTypes]
auxiliaryScoredNotesTSVTypeMapping = dict(auxiliaryScoredNotesTSVColumnsAndTypes)
# Note model output columns treated as deprecated: the coverage and group
# note-intercept min/max columns.
deprecatedNoteModelOutputColumns = frozenset(
    (
        coverageNoteInterceptMinKey,
        coverageNoteInterceptMaxKey,
        groupNoteInterceptMinKey,
        groupNoteInterceptMaxKey,
    )
)
# (column, dtype) pairs for the prescoring note model output TSV.
prescoringNoteModelOutputTSVColumnsAndTypes = [
    (noteIdKey, np.int64),
    (internalNoteInterceptKey, np.double),
    (internalNoteFactor1Key, np.double),
    (scorerNameKey, str),
    (lowDiligenceNoteInterceptKey, np.double),
    (lowDiligenceNoteFactor1Key, np.double),
    (lowDiligenceNoteInterceptRound2Key, np.double),
]
# Column names and column -> dtype lookup derived from the pairs above.
prescoringNoteModelOutputTSVColumns = [
    name for name, _ in prescoringNoteModelOutputTSVColumnsAndTypes
]
prescoringNoteModelOutputTSVTypeMapping = dict(prescoringNoteModelOutputTSVColumnsAndTypes)
# (column, dtype) pairs for the note model output TSV.
noteModelOutputTSVColumnsAndTypes = [
    (noteIdKey, np.int64),
    (coreNoteInterceptKey, np.double),
    (coreNoteFactor1Key, np.double),
    (finalRatingStatusKey, "category"),
    (firstTagKey, "category"),
    (secondTagKey, "category"),
    # Note that this column was formerly named "activeRules" and the name is now
    # updated to "coreActiveRules". The data values remain compatible,
    # but the new column only contains rules that ran when deciding status based on
    # the core model.
    (coreActiveRulesKey, "category"),
    (activeFilterTagsKey, "category"),
    (classificationKey, "category"),
    (createdAtMillisKey, np.int64),
    (coreRatingStatusKey, "category"),
    (metaScorerActiveRulesKey, "category"),
    (decidedByKey, "category"),
    (expansionNoteInterceptKey, np.double),
    (expansionNoteFactor1Key, np.double),
    (expansionRatingStatusKey, "category"),
    (coverageNoteInterceptKey, np.double),
    (coverageNoteFactor1Key, np.double),
    (coverageRatingStatusKey, "category"),
    (coreNoteInterceptMinKey, np.double),
    (coreNoteInterceptMaxKey, np.double),
    (expansionNoteInterceptMinKey, "category"),  # category because always nan
    (expansionNoteInterceptMaxKey, "category"),  # category because always nan
    (coverageNoteInterceptMinKey, "category"),  # category because always nan
    (coverageNoteInterceptMaxKey, "category"),  # category because always nan
    (groupNoteInterceptKey, np.double),
    (groupNoteFactor1Key, np.double),
    (groupRatingStatusKey, "category"),
    (groupNoteInterceptMaxKey, "category"),  # category because always nan
    (groupNoteInterceptMinKey, "category"),  # category because always nan
    (modelingGroupKey, np.float64),
    (numRatingsKey, np.int64),
    (timestampMillisOfNoteCurrentLabelKey, np.double),
    (expansionPlusNoteInterceptKey, np.double),
    (expansionPlusNoteFactor1Key, np.double),
    (expansionPlusRatingStatusKey, "category"),
    (topicNoteInterceptKey, np.double),
    (topicNoteFactor1Key, np.double),
    (topicRatingStatusKey, "category"),
    (noteTopicKey, "category"),
    (topicNoteConfidentKey, pd.BooleanDtype()),
    (expansionInternalActiveRulesKey, "category"),
    (expansionPlusInternalActiveRulesKey, "category"),
    (groupInternalActiveRulesKey, "category"),
    (topicInternalActiveRulesKey, "category"),
    (coreNumFinalRoundRatingsKey, np.double),  # double because nullable.
    (expansionNumFinalRoundRatingsKey, np.double),  # double because nullable.
    (expansionPlusNumFinalRoundRatingsKey, np.double),  # double because nullable.
    (groupNumFinalRoundRatingsKey, np.double),  # double because nullable.
    (topicNumFinalRoundRatingsKey, np.double),  # double because nullable.
    (rescoringActiveRulesKey, "category"),
    (multiGroupNoteInterceptKey, np.double),
    (multiGroupNoteFactor1Key, np.double),
    (multiGroupRatingStatusKey, str),
    (modelingMultiGroupKey, np.float64),
    (multiGroupInternalActiveRulesKey, str),
    (multiGroupNumFinalRoundRatingsKey, np.double),  # double because nullable.
]
# Column names and column -> dtype lookup derived from the pairs above.
noteModelOutputTSVColumns = [name for name, _ in noteModelOutputTSVColumnsAndTypes]
noteModelOutputTSVTypeMapping = dict(noteModelOutputTSVColumnsAndTypes)
# The subset of the schema whose column is in deprecatedNoteModelOutputColumns,
# kept in schema order.
deprecatedNoteModelOutputTSVColumnsAndTypes = [
    (name, kind)
    for name, kind in noteModelOutputTSVColumnsAndTypes
    if name in deprecatedNoteModelOutputColumns
]
postSelectionValueKey = "postSelectionValue"
# (column, dtype) pairs for the prescoring rater model output TSV.
prescoringRaterModelOutputTSVColumnsAndTypes = [
    (raterParticipantIdKey, object),
    (internalRaterInterceptKey, np.double),
    (internalRaterFactor1Key, np.double),
    (crhCrnhRatioDifferenceKey, np.double),
    (meanNoteScoreKey, np.double),
    (raterAgreeRatioKey, np.double),
    (aboveHelpfulnessThresholdKey, pd.BooleanDtype()),
    (scorerNameKey, str),
    (internalRaterReputationKey, np.double),
    (lowDiligenceRaterInterceptKey, np.double),
    (lowDiligenceRaterFactor1Key, np.double),
    (lowDiligenceRaterReputationKey, np.double),
    (lowDiligenceRaterInterceptRound2Key, np.double),
    (incorrectTagRatingsMadeByRaterKey, pd.Int64Dtype()),
    (totalRatingsMadeByRaterKey, pd.Int64Dtype()),
    (postSelectionValueKey, pd.Int64Dtype()),
]
# Column names and column -> dtype lookup derived from the pairs above.
prescoringRaterModelOutputTSVColumns = [
    name for name, _ in prescoringRaterModelOutputTSVColumnsAndTypes
]
prescoringRaterModelOutputTSVTypeMapping = dict(prescoringRaterModelOutputTSVColumnsAndTypes)
# (column, dtype) pairs for the rater model output TSV.
raterModelOutputTSVColumnsAndTypes = [
    (raterParticipantIdKey, np.int64),
    (coreRaterInterceptKey, np.double),
    (coreRaterFactor1Key, np.double),
    (crhCrnhRatioDifferenceKey, np.double),
    (meanNoteScoreKey, np.double),
    (raterAgreeRatioKey, np.double),
    (successfulRatingHelpfulCount, pd.Int64Dtype()),
    (successfulRatingNotHelpfulCount, pd.Int64Dtype()),
    (successfulRatingTotal, pd.Int64Dtype()),
    (unsuccessfulRatingHelpfulCount, pd.Int64Dtype()),
    (unsuccessfulRatingNotHelpfulCount, pd.Int64Dtype()),
    (unsuccessfulRatingTotal, pd.Int64Dtype()),
    (ratingsAwaitingMoreRatings, pd.Int64Dtype()),
    (ratedAfterDecision, pd.Int64Dtype()),
    (notesCurrentlyRatedHelpful, pd.Int64Dtype()),
    (notesCurrentlyRatedNotHelpful, pd.Int64Dtype()),
    (notesAwaitingMoreRatings, pd.Int64Dtype()),
    (enrollmentState, pd.Int64Dtype()),
    (successfulRatingNeededToEarnIn, pd.Int64Dtype()),
    (authorTopNotHelpfulTagValues, str),
    (timestampOfLastStateChange, np.double),
    (aboveHelpfulnessThresholdKey, np.float64),  # nullable bool.
    (isEmergingWriterKey, pd.BooleanDtype()),
    (aggregateRatingReceivedTotal, pd.Int64Dtype()),
    (timestampOfLastEarnOut, np.double),
    (groupRaterInterceptKey, np.double),
    (groupRaterFactor1Key, np.double),
    (modelingGroupKey, np.float64),
    (raterHelpfulnessReputationKey, np.double),
    (numberOfTimesEarnedOutKey, np.float64),
    (expansionRaterInterceptKey, np.double),
    (expansionRaterFactor1Key, np.double),
    (expansionPlusRaterInterceptKey, np.double),
    (expansionPlusRaterFactor1Key, np.double),
    (multiGroupRaterInterceptKey, np.double),
    (multiGroupRaterFactor1Key, np.double),
    (modelingMultiGroupKey, np.float64),
]
# Column names and column -> dtype lookup derived from the pairs above.
raterModelOutputTSVColumns = [name for name, _ in raterModelOutputTSVColumnsAndTypes]
raterModelOutputTSVTypeMapping = dict(raterModelOutputTSVColumnsAndTypes)
# Suffix appended to a model output column name to form its "previous" counterpart.
noteStatusChangesPrev = "_prev"
# Columns derived for the note status changes output.
noteStatusChangesDerivedColumnsAndTypes = [
    (noteIdKey, np.int64),
    (noteFinalStatusChange, str),
    (noteNewRatings, np.int64),
    (noteDecidedByChange, str),
    (noteAllAddedRules, str),
    (noteAllRemovedRules, str),
    (noteDecidedByInterceptChange, str),
]
# Model output columns left out of the status changes output: every column whose
# name contains "NoteInterceptMin" or "NoteInterceptMax".
noteStatusChangesRemovedCols = [
    name
    for name in noteModelOutputTSVColumns
    if any(marker in name for marker in ("NoteInterceptMin", "NoteInterceptMax"))
]
# Model output schema minus the removed columns and minus the note id (which is
# already the first derived column).
noteStatusChangesModelOutputColumnsAndTypes = [
    (name, kind)
    for name, kind in noteModelOutputTSVColumnsAndTypes
    if name != noteIdKey and name not in noteStatusChangesRemovedCols
]
# The kept model output columns followed by their "_prev" counterparts.
noteStatusChangesModelOutputWithPreviousColumnsAndTypes = [
    *noteStatusChangesModelOutputColumnsAndTypes,
    *(
        (name + noteStatusChangesPrev, kind)
        for name, kind in noteStatusChangesModelOutputColumnsAndTypes
    ),
]
# Final schema: derived columns first, then the model output columns sorted by
# column name only (the dtype never takes part in the comparison).
noteStatusChangeTSVColumnsAndTypes = noteStatusChangesDerivedColumnsAndTypes + sorted(
    noteStatusChangesModelOutputWithPreviousColumnsAndTypes, key=lambda pair: pair[0]
)
# Column names and column -> dtype lookup derived from the final schema.
noteStatusChangesTSVColumns = [name for name, _ in noteStatusChangeTSVColumnsAndTypes]
noteStatusChangesTSVTypeMapping = dict(noteStatusChangeTSVColumnsAndTypes)
# Input paths TSV columns.
datasetKeyKey = "datasetKey"
partitionToReadKey = "partitionToRead"
fileNameToReadKey = "fileNameToRead"
# (column, dtype) pairs; every column is a string.
inputPathsTSVColumnsAndTypes = [
    (datasetKeyKey, str),
    (partitionToReadKey, str),
    (fileNameToReadKey, str),
]
# Column names and column -> dtype lookup derived from the pairs above.
inputPathsTSVColumns = [name for name, _ in inputPathsTSVColumnsAndTypes]
inputPathsTSVTypeMapping = dict(inputPathsTSVColumnsAndTypes)
@contextmanager
def time_block(label):
  """Log how long the body of the enclosing ``with`` statement took to run.

  Args:
    label: Text placed at the start of the log line to identify the timed section.

  Yields:
    None. The elapsed time is logged through the module logger at INFO level when
    the ``with`` body exits, including when it exits by raising (the exception
    still propagates to the caller).
  """
  # perf_counter() is a monotonic, high-resolution clock. Unlike time.time(), it
  # cannot jump when the system clock is adjusted, so the reported duration is
  # never skewed or negative.
  start = time.perf_counter()
  try:
    yield
  finally:
    elapsed = time.perf_counter() - start
    logger.info(f"{label} elapsed time: {elapsed:.2f} secs ({(elapsed / 60.0):.2f} mins)")
### TODO: weave through second round intercept.
@dataclass
class ReputationGlobalIntercept:
  """Three float global intercept values, one per round.

  Used as the type of PrescoringMetaScorerOutput.lowDiligenceGlobalIntercept.
  """

  # Intercept associated with the first round.
  firstRound: float
  # Intercept associated with the second round.
  secondRound: float
  # Intercept associated with the final round.
  finalRound: float
@dataclass
class PrescoringMetaScorerOutput:
  """Scalar metadata recorded for a single scorer during prescoring.

  Every field is Optional, so any of them may be None.
  """

  # Global intercept value.
  globalIntercept: Optional[float]
  # Per-round global intercepts for the low diligence model.
  lowDiligenceGlobalIntercept: Optional[ReputationGlobalIntercept]
  # Mapping of tag => threshold.
  tagFilteringThresholds: Optional[Dict[str, float]]
  # Counts of ratings, notes and users for the final round.
  finalRoundNumRatings: Optional[int]
  finalRoundNumNotes: Optional[int]
  finalRoundNumUsers: Optional[int]
@dataclass
class PrescoringMetaOutput:
  """Container collecting the PrescoringMetaScorerOutput of each scorer."""

  # Mapping of scorerName => that scorer's meta output.
  metaScorerOutput: Dict[str, PrescoringMetaScorerOutput]
@dataclass
class SharedMemoryDataframeInfo:
  """Descriptor for a DataFrame referenced through shared memory."""

  # Name identifying the shared memory segment.
  sharedMemoryName: str
  # Size of the stored data (presumably in bytes; confirm against the code that writes it).
  dataSize: int
@dataclass
class ScoringArgsSharedMemory:
  """Shared-memory descriptors for the DataFrames common to all scoring phases.

  Field names mirror the DataFrame fields of ScoringArgs; each holds a
  SharedMemoryDataframeInfo rather than the DataFrame itself.
  """

  noteTopics: SharedMemoryDataframeInfo
  ratings: SharedMemoryDataframeInfo
  noteStatusHistory: SharedMemoryDataframeInfo
  userEnrollment: SharedMemoryDataframeInfo
@dataclass
class PrescoringArgsSharedMemory(ScoringArgsSharedMemory):
  """Shared-memory descriptors for prescoring; adds no fields to ScoringArgsSharedMemory."""
@dataclass
class FinalScoringArgsSharedMemory(ScoringArgsSharedMemory):
  """Shared-memory descriptors for final scoring.

  Extends ScoringArgsSharedMemory with descriptors for the two prescoring model
  output DataFrames.
  """

  prescoringNoteModelOutput: SharedMemoryDataframeInfo
  prescoringRaterModelOutput: SharedMemoryDataframeInfo
@dataclass
class ScoringArgs:
  """DataFrame inputs shared by every scoring phase.

  ScoringArgsSharedMemory declares a SharedMemoryDataframeInfo under the same
  name for each field here.
  """

  noteTopics: pd.DataFrame
  ratings: pd.DataFrame
  noteStatusHistory: pd.DataFrame
  userEnrollment: pd.DataFrame

  def remove_large_args_for_multiprocessing(self):
    """Set all four DataFrame fields to None, mutating this instance in place.

    Returns:
      None.
    """
    for fieldName in ("noteTopics", "ratings", "noteStatusHistory", "userEnrollment"):
      setattr(self, fieldName, None)
@dataclass
class PrescoringArgs(ScoringArgs):
  """Inputs for prescoring; adds no fields or methods to ScoringArgs."""
@dataclass
class FinalScoringArgs(ScoringArgs):
  """Inputs for final scoring: the ScoringArgs fields plus the prescoring outputs."""

  prescoringNoteModelOutput: pd.DataFrame
  prescoringRaterModelOutput: pd.DataFrame
  prescoringMetaOutput: PrescoringMetaOutput

  def remove_large_args_for_multiprocessing(self):
    """Set the listed DataFrame fields to None, mutating this instance in place.

    noteTopics and prescoringMetaOutput are left untouched.

    Returns:
      None.
    """
    # NOTE(review): the ScoringArgs version of this method also clears noteTopics;
    # confirm that retaining it here is intentional.
    self.ratings = self.noteStatusHistory = self.userEnrollment = None
    self.prescoringNoteModelOutput = self.prescoringRaterModelOutput = None
@dataclass
class ModelResult:
  """Bundle of the DataFrames and metadata attributed to a single scorer."""

  scoredNotes: pd.DataFrame
  helpfulnessScores: pd.DataFrame
  auxiliaryNoteInfo: pd.DataFrame
  # Name of the scorer; may be None.
  scorerName: Optional[str]
  # Prescoring meta output for the scorer; may be None.
  metaScores: Optional[PrescoringMetaScorerOutput]
class RescoringRuleID(Enum):
  """Identifiers for rescoring rules, with explicit integer values 1 through 8.

  NoteSubset.description is annotated with this type.
  """

  ALL_NOTES = 1
  NOTES_WITH_NEW_RATINGS = 2
  NOTES_FLIPPED_PREVIOUS_RUN = 3
  NEW_NOTES_NOT_RESCORED_RECENTLY_ENOUGH = 4
  RECENTLY_FLIPPED_NOTES_NOT_RESCORED_RECENTLY_ENOUGH = 5
  NMR_DUE_TO_MIN_STABLE_CRH_TIME = 6
  NOTES_CREATED_SOMEWHAT_RECENTLY = 7
  LOCKING_ELIGIBLE_RECENT_UNLOCKED_NOTES = 8
@dataclass
class NoteSubset:
  """A set of notes paired with maximum CRH churn rates and a rescoring rule ID."""

  # Notes in the subset. NOTE(review): None presumably means "no restriction";
  # confirm against the code that consumes this field.
  noteSet: Optional[set]
  # Upper bounds on the new and old CRH churn rates for this subset.
  maxNewCrhChurnRate: float
  maxOldCrhChurnRate: float
  # Rule that this subset corresponds to.
  description: RescoringRuleID