in src/java/com/twitter/search/earlybird/search/relevance/scoring/FeatureBasedScoringFunction.java [205:454]
/**
 * Creates the {@link LinearScoringData} for the current document, registers it in
 * {@code docIdToScoringData} under the current doc ID, fills it from the document features and
 * request params, and runs the anti-gaming, social and quality filters along the way.
 *
 * <p>As soon as a filter rejects the hit, the matching {@code SkipReason} is recorded and the
 * method returns immediately; every field that would have been populated after that point is
 * left exactly as {@code new LinearScoringData()} initialized it.
 *
 * @param luceneQueryScore the lucene score of the hit, stored unchanged in {@code luceneScore}
 * @return the scoring data for the current document; check {@code skipReason} to see whether
 *     the hit was filtered out
 * @throws IOException declared by the signature; presumably propagated from the feature,
 *     filter or mapper lookups (NOTE(review): confirm which calls actually throw)
 */
protected LinearScoringData updateLinearScoringData(float luceneQueryScore) throws IOException {
  // Every tweet gets a brand-new instance; nothing is carried over from the previous hit.
  LinearScoringData scoring = new LinearScoringData();
  docIdToScoringData.put(getCurrentDocID(), scoring);

  scoring.skipReason = SkipReason.NOT_SKIPPED;
  scoring.luceneScore = luceneQueryScore;
  scoring.userRep =
      (byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.USER_REPUTATION);

  // Anti-gaming is checked first: a rejected hit returns with only the fields above set.
  boolean rejectedByAntiGaming =
      antiGamingFilter != null && !antiGamingFilter.accept(getCurrentDocID());
  if (rejectedByAntiGaming) {
    scoring.skipReason = SkipReason.ANTIGAMING;
    return scoring;
  }

  scoring.textScore =
      (byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.TEXT_SCORE);
  scoring.tokenAt140DividedByNumTokensBucket = VISIBLE_TOKEN_RATIO_NORMALIZER.denormalize(
      (byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.VISIBLE_TOKEN_RATIO));

  // Author identity and the searcher's relationship to the author.
  scoring.fromUserId =
      documentFeatures.getFeatureValue(EarlybirdFieldConstant.FROM_USER_ID_CSF);
  scoring.isFollow = followFilter != null
      && followFilter.contains(Longs.toByteArray(scoring.fromUserId));
  scoring.isTrusted = trustedFilter != null
      && trustedFilter.contains(Longs.toByteArray(scoring.fromUserId));
  scoring.isFromVerifiedAccount =
      documentFeatures.isFlagSet(EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG);
  scoring.isFromBlueVerifiedAccount =
      documentFeatures.isFlagSet(EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG);
  scoring.isSelfTweet = scoring.fromUserId == params.searcherId;

  // v1 engagement counters. The retweet, reply and fav values are the post-log2 versions of
  // the original unnormalized counts.
  scoring.retweetCountPostLog2 =
      documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.RETWEET_COUNT);
  scoring.replyCountPostLog2 =
      documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.REPLY_COUNT);
  scoring.favCountPostLog2 =
      documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.FAVORITE_COUNT);
  scoring.embedsImpressionCount = documentFeatures.getUnnormalizedFeatureValue(
      EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT);
  scoring.embedsUrlCount =
      documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.EMBEDS_URL_COUNT);
  scoring.videoViewCount =
      documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.VIDEO_VIEW_COUNT);

  // v2 engagement counters that have a v1 counterpart.
  scoring.retweetCountV2 =
      documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.RETWEET_COUNT_V2);
  scoring.replyCountV2 =
      documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.REPLY_COUNT_V2);
  scoring.favCountV2 =
      documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.FAVORITE_COUNT_V2);
  scoring.embedsImpressionCountV2 = documentFeatures.getUnnormalizedFeatureValue(
      EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT_V2);
  scoring.embedsUrlCountV2 =
      documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.EMBEDS_URL_COUNT_V2);
  scoring.videoViewCountV2 =
      documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.VIDEO_VIEW_COUNT_V2);

  // v2-only engagement counters (no v1 counterpart).
  scoring.quotedCount =
      documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.QUOTE_COUNT);
  scoring.weightedRetweetCount = documentFeatures.getUnnormalizedFeatureValue(
      EarlybirdFieldConstant.WEIGHTED_RETWEET_COUNT);
  scoring.weightedReplyCount = documentFeatures.getUnnormalizedFeatureValue(
      EarlybirdFieldConstant.WEIGHTED_REPLY_COUNT);
  scoring.weightedFavCount = documentFeatures.getUnnormalizedFeatureValue(
      EarlybirdFieldConstant.WEIGHTED_FAVORITE_COUNT);
  scoring.weightedQuoteCount = documentFeatures.getUnnormalizedFeatureValue(
      EarlybirdFieldConstant.WEIGHTED_QUOTE_COUNT);

  // Per-request score adjustments keyed by tweet ID and by author ID; a missing map or a
  // missing entry contributes 0.0.
  Double tweetAdjustment = null;
  if (params.querySpecificScoreAdjustments != null) {
    tweetAdjustment = params.querySpecificScoreAdjustments.get(
        tweetIDMapper.getTweetID(getCurrentDocID()));
  }
  scoring.querySpecificScore = tweetAdjustment != null ? tweetAdjustment : 0.0;
  scoring.authorSpecificScore = params.authorSpecificScoreAdjustments != null
      ? params.authorSpecificScoreAdjustments.getOrDefault(scoring.fromUserId, 0.0)
      : 0.0;

  // Social filter: when a filter type is requested, only hits matching that relationship
  // survive. The searcher's own tweets are never dropped here.
  if (params.socialFilterType != null && !scoring.isSelfTweet) {
    boolean failsAll = params.socialFilterType == ThriftSocialFilterType.ALL
        && !scoring.isFollow && !scoring.isTrusted;
    boolean failsTrusted = params.socialFilterType == ThriftSocialFilterType.TRUSTED
        && !scoring.isTrusted;
    boolean failsFollows = params.socialFilterType == ThriftSocialFilterType.FOLLOWS
        && !scoring.isFollow;
    if (failsAll || failsTrusted || failsFollows) {
      scoring.skipReason = SkipReason.SOCIAL_FILTER;
      return scoring;
    }
  }

  // Quality filters. Unless the request asks for them to apply always, self tweets, followed
  // authors and (blue) verified accounts are exempt. The checks run in a fixed order and the
  // first failing one decides the skip reason.
  boolean exemptFromFilters = scoring.isSelfTweet
      || scoring.isFollow
      || scoring.isFromVerifiedAccount
      || scoring.isFromBlueVerifiedAccount;
  if (params.applyFiltersAlways || !exemptFromFilters) {
    // An unset user reputation is never filtered: it gets the benefit of the doubt and goes
    // on to scoring. It is unset when the user just signed up, or when the reputation service
    // could not be reached at ingestion time.
    if (scoring.userRep != RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL
        && scoring.userRep < params.reputationMinVal) {
      scoring.skipReason = SkipReason.LOW_REPUTATION;
      return scoring;
    }
    // Likewise an unset text score is not filtered; it is replaced by a goodwill value below.
    if (scoring.textScore != RelevanceSignalConstants.UNSET_TEXT_SCORE_SENTINEL
        && scoring.textScore < params.textScoreMinVal) {
      scoring.skipReason = SkipReason.LOW_TEXT_SCORE;
      return scoring;
    }
    if (scoring.retweetCountPostLog2 != LinearScoringData.UNSET_SIGNAL_VALUE
        && scoring.retweetCountPostLog2 < params.retweetMinVal) {
      scoring.skipReason = SkipReason.LOW_RETWEET_COUNT;
      return scoring;
    }
    if (scoring.favCountPostLog2 != LinearScoringData.UNSET_SIGNAL_VALUE
        && scoring.favCountPostLog2 < params.favMinVal) {
      scoring.skipReason = SkipReason.LOW_FAV_COUNT;
      return scoring;
    }
  }

  // The hit survived the filters: swap sentinel values for goodwill defaults before scoring.
  if (scoring.textScore == RelevanceSignalConstants.UNSET_TEXT_SCORE_SENTINEL) {
    scoring.textScore = RelevanceSignalConstants.GOODWILL_TEXT_SCORE;
  }
  if (scoring.userRep == RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL) {
    scoring.userRep = RelevanceSignalConstants.GOODWILL_REPUTATION;
  }

  // Tweet age relative to `now`, clamped at zero because an age cannot be negative.
  scoring.tweetAgeInSeconds = Math.max(0, now - timeMapper.getTime(getCurrentDocID()));

  // The PARUS_SCORE feature should be read as is.
  scoring.parusScore = documentFeatures.getFeatureValue(EarlybirdFieldConstant.PARUS_SCORE);

  // Boolean tweet and user flags.
  scoring.isNullcast = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_NULLCAST_FLAG);
  scoring.hasUrl = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_LINK_FLAG);
  scoring.hasImageUrl = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG);
  scoring.hasVideoUrl = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG);
  scoring.hasNewsUrl = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_NEWS_URL_FLAG);
  scoring.isReply = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_REPLY_FLAG);
  scoring.isRetweet = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_RETWEET_FLAG);
  scoring.isOffensive = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG);
  scoring.hasTrend = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_TREND_FLAG);
  scoring.hasMultipleHashtagsOrTrends = documentFeatures.isFlagSet(
      EarlybirdFieldConstant.HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG);
  scoring.isUserSpam = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_USER_SPAM_FLAG);
  // NSFW is true if either the indexed flag or the user table bit says so; the user table is
  // only consulted when the indexed flag is not set.
  scoring.isUserNSFW = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_USER_NSFW_FLAG)
      || userTable.isSet(scoring.fromUserId, UserTable.NSFW_BIT);
  scoring.isUserAntiSocial = userTable.isSet(scoring.fromUserId, UserTable.ANTISOCIAL_BIT);
  scoring.isUserBot = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_USER_BOT_FLAG);

  // The card type is only read for tweets that carry a card; otherwise it stays UNKNOWN.
  scoring.hasCard = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_CARD_FLAG);
  if (scoring.hasCard) {
    scoring.cardType =
        (byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.CARD_TYPE_CSF_FIELD);
  } else {
    scoring.cardType = SearchCardType.UNKNOWN.getByteValue();
  }

  scoring.hasVisibleLink =
      documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_VISIBLE_LINK_FLAG);
  scoring.hasConsumerVideo =
      documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_CONSUMER_VIDEO_FLAG);
  scoring.hasProVideo = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_PRO_VIDEO_FLAG);
  scoring.hasVine = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_VINE_FLAG);
  scoring.hasPeriscope = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_PERISCOPE_FLAG);
  scoring.hasNativeImage =
      documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_NATIVE_IMAGE_FLAG);
  scoring.hasQuote = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_QUOTE_FLAG);
  scoring.isComposerSourceCamera =
      documentFeatures.isFlagSet(EarlybirdFieldConstant.COMPOSER_SOURCE_IS_CAMERA_FLAG);

  // Minor optimization: the shared status ID is only read for retweets, or for replies when
  // the request asked for the in-reply-to status ID.
  if (scoring.isRetweet || (params.getInReplyToStatusId && scoring.isReply)) {
    scoring.sharedStatusId =
        documentFeatures.getFeatureValue(EarlybirdFieldConstant.SHARED_STATUS_ID_CSF);
  }

  // Minor optimization: the reference tweet author ID is only read for retweets and replies.
  if (scoring.isRetweet || scoring.isReply) {
    // REFERENCE_AUTHOR_ID_CSF stores the source tweet author id for all retweets.
    long authorOfReference =
        documentFeatures.getFeatureValue(EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF);
    if (authorOfReference <= 0) {
      // Realtime/Protected Earlybirds also store the reference author id for retweets,
      // directed-at tweets and self-threaded tweets as two separate ints. That data will be
      // moved to REFERENCE_AUTHOR_ID_CSF and these fields deprecated in SEARCH-34958.
      authorOfReference = LongIntConverter.convertTwoIntToOneLong(
          (int) documentFeatures.getFeatureValue(
              EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT),
          (int) documentFeatures.getFeatureValue(
              EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT));
    }
    // Only a positive ID, from either source, is recorded.
    if (authorOfReference > 0) {
      scoring.referenceAuthorId = authorOfReference;
    }
  }

  // Round-trip the language through ThriftLanguage so the stored value is one that our
  // current ThriftLanguage definition knows about.
  scoring.tweetLangId = ThriftLanguageUtil.safeFindByValue(
      (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.LANGUAGE)).getValue();

  // Language-related features are set here so they can be used later for promotion/demotion
  // and also be transferred to ThriftSearchResultMetadata.
  scoring.userLangMult = computeUserLangMultiplier(scoring, params);
  scoring.hasDifferentLang = params.uiLangId != ThriftLanguage.UNKNOWN.getValue()
      && params.uiLangId != scoring.tweetLangId;
  scoring.hasEnglishTweetAndDifferentUILang = scoring.hasDifferentLang
      && scoring.tweetLangId == ThriftLanguage.ENGLISH.getValue();
  scoring.hasEnglishUIAndDifferentTweetLang = scoring.hasDifferentLang
      && params.uiLangId == ThriftLanguage.ENGLISH.getValue();

  // Features exposed to clients.
  scoring.isSensitiveContent =
      documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_SENSITIVE_CONTENT);
  scoring.hasMultipleMediaFlag =
      documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_MULTIPLE_MEDIA_FLAG);
  scoring.profileIsEggFlag =
      documentFeatures.isFlagSet(EarlybirdFieldConstant.PROFILE_IS_EGG_FLAG);
  scoring.isUserNewFlag = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_USER_NEW_FLAG);
  scoring.numMentions =
      (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.NUM_MENTIONS);
  scoring.numHashtags =
      (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.NUM_HASHTAGS);
  scoring.linkLanguage =
      (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.LINK_LANGUAGE);
  scoring.prevUserTweetEngagement = (int) documentFeatures.getFeatureValue(
      EarlybirdFieldConstant.PREV_USER_TWEET_ENGAGEMENT);

  // Health model scores by HML.
  scoring.toxicityScore =
      documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.TOXICITY_SCORE);
  scoring.pBlockScore =
      documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.PBLOCK_SCORE);
  scoring.pSpammyTweetScore = documentFeatures.getUnnormalizedFeatureValue(
      EarlybirdFieldConstant.P_SPAMMY_TWEET_SCORE);
  scoring.pReportedTweetScore = documentFeatures.getUnnormalizedFeatureValue(
      EarlybirdFieldConstant.P_REPORTED_TWEET_SCORE);
  scoring.spammyTweetContentScore = documentFeatures.getUnnormalizedFeatureValue(
      EarlybirdFieldConstant.SPAMMY_TWEET_CONTENT_SCORE);
  scoring.experimentalHealthModelScore1 = documentFeatures.getUnnormalizedFeatureValue(
      EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_1);
  scoring.experimentalHealthModelScore2 = documentFeatures.getUnnormalizedFeatureValue(
      EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_2);
  scoring.experimentalHealthModelScore3 = documentFeatures.getUnnormalizedFeatureValue(
      EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_3);
  scoring.experimentalHealthModelScore4 = documentFeatures.getUnnormalizedFeatureValue(
      EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_4);

  return scoring;
}