protected LinearScoringData updateLinearScoringData()

in src/java/com/twitter/search/earlybird/search/relevance/scoring/FeatureBasedScoringFunction.java [205:454]


  /**
   * Creates the {@link LinearScoringData} for the current document, registers it in
   * {@code docIdToScoringData} under the current doc ID, and populates it from the document's
   * features and the request {@code params}.
   *
   * <p>Population stops early, with {@code skipReason} set accordingly and all later fields left
   * as {@code new LinearScoringData()} initialized them, when the hit is rejected by (in order):
   * <ol>
   *   <li>the anti-gaming filter ({@code ANTIGAMING});</li>
   *   <li>the requested social filter type ({@code SOCIAL_FILTER});</li>
   *   <li>the minimum reputation, text score, retweet or fav thresholds
   *       ({@code LOW_REPUTATION}, {@code LOW_TEXT_SCORE}, {@code LOW_RETWEET_COUNT},
   *       {@code LOW_FAV_COUNT}).</li>
   * </ol>
   * Otherwise {@code skipReason} stays {@code NOT_SKIPPED} and every field below is filled in.
   *
   * @param luceneQueryScore the value stored in {@code luceneScore} of the returned data
   * @return the scoring data registered for the current document; callers should inspect
   *     {@code skipReason} to tell a fully populated record from an early exit
   * @throws IOException declared by the signature; nothing here throws it directly, so it is
   *     presumably propagated from the feature or mapper lookups (confirm against those APIs)
   */
  protected LinearScoringData updateLinearScoringData(float luceneQueryScore) throws IOException {
    // Every document gets its own brand-new record; nothing carries over from the previous hit.
    LinearScoringData scoring = new LinearScoringData();
    docIdToScoringData.put(getCurrentDocID(), scoring);

    // Baseline state: the hit counts as kept until one of the checks below decides otherwise.
    scoring.skipReason = SkipReason.NOT_SKIPPED;
    scoring.luceneScore = luceneQueryScore;
    scoring.userRep =
        (byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.USER_REPUTATION);

    // The anti-gaming check runs before any other feature is read, so a hit rejected here only
    // carries the three fields assigned above.
    if (antiGamingFilter != null && !antiGamingFilter.accept(getCurrentDocID())) {
      scoring.skipReason = SkipReason.ANTIGAMING;
      return scoring;
    }

    scoring.textScore =
        (byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.TEXT_SCORE);
    byte visibleTokenRatio =
        (byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.VISIBLE_TOKEN_RATIO);
    scoring.tokenAt140DividedByNumTokensBucket =
        VISIBLE_TOKEN_RATIO_NORMALIZER.denormalize(visibleTokenRatio);

    // Author identity and the searcher's relationship to the author.
    scoring.fromUserId =
        documentFeatures.getFeatureValue(EarlybirdFieldConstant.FROM_USER_ID_CSF);
    scoring.isFollow = followFilter != null
        && followFilter.contains(Longs.toByteArray(scoring.fromUserId));
    scoring.isTrusted = trustedFilter != null
        && trustedFilter.contains(Longs.toByteArray(scoring.fromUserId));
    scoring.isFromVerifiedAccount =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG);
    scoring.isFromBlueVerifiedAccount =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG);
    scoring.isSelfTweet = scoring.fromUserId == params.searcherId;

    // v1 engagement counters. Retweet, reply and fav hold the post-log2 version of the
    // original unnormalized values.
    scoring.retweetCountPostLog2 = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.RETWEET_COUNT);
    scoring.replyCountPostLog2 = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.REPLY_COUNT);
    scoring.favCountPostLog2 = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.FAVORITE_COUNT);
    scoring.embedsImpressionCount = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT);
    scoring.embedsUrlCount = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.EMBEDS_URL_COUNT);
    scoring.videoViewCount = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.VIDEO_VIEW_COUNT);

    // v2 counterparts of the v1 engagement counters.
    scoring.retweetCountV2 = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.RETWEET_COUNT_V2);
    scoring.replyCountV2 = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.REPLY_COUNT_V2);
    scoring.favCountV2 = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.FAVORITE_COUNT_V2);
    scoring.embedsImpressionCountV2 = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT_V2);
    scoring.embedsUrlCountV2 = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.EMBEDS_URL_COUNT_V2);
    scoring.videoViewCountV2 = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.VIDEO_VIEW_COUNT_V2);

    // v2-only engagement counters (no v1 counterpart).
    scoring.quotedCount = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.QUOTE_COUNT);
    scoring.weightedRetweetCount = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.WEIGHTED_RETWEET_COUNT);
    scoring.weightedReplyCount = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.WEIGHTED_REPLY_COUNT);
    scoring.weightedFavCount = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.WEIGHTED_FAVORITE_COUNT);
    scoring.weightedQuoteCount = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.WEIGHTED_QUOTE_COUNT);

    // Per-tweet adjustment supplied with the request; 0.0 when there is no adjustment map or
    // no entry for this tweet.
    Double tweetAdjustment = null;
    if (params.querySpecificScoreAdjustments != null) {
      tweetAdjustment = params.querySpecificScoreAdjustments.get(
          tweetIDMapper.getTweetID(getCurrentDocID()));
    }
    scoring.querySpecificScore = tweetAdjustment != null ? tweetAdjustment : 0.0;

    // Per-author adjustment supplied with the request, with the same 0.0 fallback.
    scoring.authorSpecificScore = params.authorSpecificScoreAdjustments != null
        ? params.authorSpecificScoreAdjustments.getOrDefault(scoring.fromUserId, 0.0)
        : 0.0;

    // Social filter: when a filter type is requested, hits from other users must satisfy it.
    // The searcher's own tweets are never dropped here.
    if (params.socialFilterType != null && !scoring.isSelfTweet) {
      boolean failsAll = params.socialFilterType == ThriftSocialFilterType.ALL
          && !(scoring.isFollow || scoring.isTrusted);
      boolean failsTrusted = params.socialFilterType == ThriftSocialFilterType.TRUSTED
          && !scoring.isTrusted;
      boolean failsFollows = params.socialFilterType == ThriftSocialFilterType.FOLLOWS
          && !scoring.isFollow;
      if (failsAll || failsTrusted || failsFollows) {
        // Only social results are wanted in this mode, so this hit can be skipped.
        scoring.skipReason = SkipReason.SOCIAL_FILTER;
        return scoring;
      }
    }

    // Quality thresholds. They are enforced when the request forces them (applyFiltersAlways)
    // or when the author is neither the searcher, nor followed, nor (blue-)verified.
    // A value equal to its "unset" sentinel never causes a skip.
    boolean authorIsExempt = scoring.isSelfTweet
        || scoring.isFollow
        || scoring.isFromVerifiedAccount
        || scoring.isFromBlueVerifiedAccount;
    if (params.applyFiltersAlways || !authorIsExempt) {
      // Unset userreps get the benefit of the doubt and continue to scoring. Userrep is unset
      // when the user just signed up, or when ingestion had trouble getting it from the
      // reputation service.
      if (scoring.userRep < params.reputationMinVal
          && scoring.userRep != RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL) {
        scoring.skipReason = SkipReason.LOW_REPUTATION;
        return scoring;
      }
      // Unset text scores are not filtered either; they receive the goodwill value below.
      if (scoring.textScore < params.textScoreMinVal
          && scoring.textScore != RelevanceSignalConstants.UNSET_TEXT_SCORE_SENTINEL) {
        scoring.skipReason = SkipReason.LOW_TEXT_SCORE;
        return scoring;
      }
      if (scoring.retweetCountPostLog2 != LinearScoringData.UNSET_SIGNAL_VALUE
          && scoring.retweetCountPostLog2 < params.retweetMinVal) {
        scoring.skipReason = SkipReason.LOW_RETWEET_COUNT;
        return scoring;
      }
      if (scoring.favCountPostLog2 != LinearScoringData.UNSET_SIGNAL_VALUE
          && scoring.favCountPostLog2 < params.favMinVal) {
        scoring.skipReason = SkipReason.LOW_FAV_COUNT;
        return scoring;
      }
    }

    // From here on the hit is kept. Swap sentinel values for their goodwill defaults so that
    // scoring can proceed with usable numbers.
    if (scoring.textScore == RelevanceSignalConstants.UNSET_TEXT_SCORE_SENTINEL) {
      scoring.textScore = RelevanceSignalConstants.GOODWILL_TEXT_SCORE;
    }
    if (scoring.userRep == RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL) {
      scoring.userRep = RelevanceSignalConstants.GOODWILL_REPUTATION;
    }

    // Age is measured against `now` and clamped at zero: it cannot be negative.
    scoring.tweetAgeInSeconds = Math.max(0, now - timeMapper.getTime(getCurrentDocID()));

    // The PARUS_SCORE feature is read as is.
    scoring.parusScore = documentFeatures.getFeatureValue(EarlybirdFieldConstant.PARUS_SCORE);

    // Tweet content and author flags.
    scoring.isNullcast = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_NULLCAST_FLAG);
    scoring.hasUrl = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_LINK_FLAG);
    scoring.hasImageUrl =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG);
    scoring.hasVideoUrl =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG);
    scoring.hasNewsUrl = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_NEWS_URL_FLAG);
    scoring.isReply = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_REPLY_FLAG);
    scoring.isRetweet = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_RETWEET_FLAG);
    scoring.isOffensive =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG);
    scoring.hasTrend = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_TREND_FLAG);
    scoring.hasMultipleHashtagsOrTrends =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG);
    scoring.isUserSpam = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_USER_SPAM_FLAG);
    // NSFW is true if either the document flag or the user table bit says so.
    scoring.isUserNSFW = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_USER_NSFW_FLAG)
        || userTable.isSet(scoring.fromUserId, UserTable.NSFW_BIT);
    scoring.isUserAntiSocial = userTable.isSet(scoring.fromUserId, UserTable.ANTISOCIAL_BIT);
    scoring.isUserBot = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_USER_BOT_FLAG);

    // Card type defaults to UNKNOWN and is only read from the index when the tweet has a card.
    scoring.hasCard = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_CARD_FLAG);
    scoring.cardType = SearchCardType.UNKNOWN.getByteValue();
    if (scoring.hasCard) {
      scoring.cardType =
          (byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.CARD_TYPE_CSF_FIELD);
    }
    scoring.hasVisibleLink =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_VISIBLE_LINK_FLAG);

    // Media and composer flags.
    scoring.hasConsumerVideo =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_CONSUMER_VIDEO_FLAG);
    scoring.hasProVideo =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_PRO_VIDEO_FLAG);
    scoring.hasVine = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_VINE_FLAG);
    scoring.hasPeriscope =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_PERISCOPE_FLAG);
    scoring.hasNativeImage =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_NATIVE_IMAGE_FLAG);
    scoring.hasQuote = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_QUOTE_FLAG);
    scoring.isComposerSourceCamera =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.COMPOSER_SOURCE_IS_CAMERA_FLAG);

    // Shared status ID: always read for retweets; for replies only when the request asked for
    // the in-reply-to status ID. Skipping the read otherwise is a minor optimization.
    if (scoring.isRetweet || (params.getInReplyToStatusId && scoring.isReply)) {
      scoring.sharedStatusId =
          documentFeatures.getFeatureValue(EarlybirdFieldConstant.SHARED_STATUS_ID_CSF);
    }

    // Reference author ID: only looked up for retweets and replies (minor optimization).
    if (scoring.isRetweet || scoring.isReply) {
      // REFERENCE_AUTHOR_ID_CSF stores the source tweet author id for all retweets.
      long referenceAuthor =
          documentFeatures.getFeatureValue(EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF);
      if (referenceAuthor <= 0) {
        // The reference author id for retweets, directed-at tweets and self-threaded tweets is
        // also stored separately, split into two ints, on Realtime/Protected Earlybirds. That
        // data will move to REFERENCE_AUTHOR_ID_CSF and these fields will be deprecated in
        // SEARCH-34958.
        int mostSignificant = (int) documentFeatures.getFeatureValue(
            EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT);
        int leastSignificant = (int) documentFeatures.getFeatureValue(
            EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT);
        referenceAuthor =
            LongIntConverter.convertTwoIntToOneLong(mostSignificant, leastSignificant);
      }
      // Non-positive values from both sources leave the field untouched.
      if (referenceAuthor > 0) {
        scoring.referenceAuthorId = referenceAuthor;
      }
    }

    // The stored language goes through ThriftLanguage and back to an int so that the value is
    // always compatible with the current ThriftLanguage definition.
    scoring.tweetLangId = ThriftLanguageUtil.safeFindByValue(
        (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.LANGUAGE)).getValue();
    // Language-derived features are set here so they can be used later for promotion/demotion
    // and also be copied into ThriftSearchResultMetadata.
    scoring.userLangMult = computeUserLangMultiplier(scoring, params);
    scoring.hasDifferentLang = params.uiLangId != ThriftLanguage.UNKNOWN.getValue()
        && params.uiLangId != scoring.tweetLangId;
    scoring.hasEnglishTweetAndDifferentUILang = scoring.hasDifferentLang
        && scoring.tweetLangId == ThriftLanguage.ENGLISH.getValue();
    scoring.hasEnglishUIAndDifferentTweetLang = scoring.hasDifferentLang
        && params.uiLangId == ThriftLanguage.ENGLISH.getValue();

    // Features that are exposed to clients.
    scoring.isSensitiveContent =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_SENSITIVE_CONTENT);
    scoring.hasMultipleMediaFlag =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_MULTIPLE_MEDIA_FLAG);
    scoring.profileIsEggFlag =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.PROFILE_IS_EGG_FLAG);
    scoring.isUserNewFlag =
        documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_USER_NEW_FLAG);
    scoring.numMentions =
        (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.NUM_MENTIONS);
    scoring.numHashtags =
        (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.NUM_HASHTAGS);
    scoring.linkLanguage =
        (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.LINK_LANGUAGE);
    scoring.prevUserTweetEngagement =
        (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.PREV_USER_TWEET_ENGAGEMENT);

    // Health model scores by HML.
    scoring.toxicityScore = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.TOXICITY_SCORE);
    scoring.pBlockScore = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.PBLOCK_SCORE);
    scoring.pSpammyTweetScore = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.P_SPAMMY_TWEET_SCORE);
    scoring.pReportedTweetScore = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.P_REPORTED_TWEET_SCORE);
    scoring.spammyTweetContentScore = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.SPAMMY_TWEET_CONTENT_SCORE);
    scoring.experimentalHealthModelScore1 = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_1);
    scoring.experimentalHealthModelScore2 = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_2);
    scoring.experimentalHealthModelScore3 = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_3);
    scoring.experimentalHealthModelScore4 = documentFeatures.getUnnormalizedFeatureValue(
        EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_4);

    return scoring;
  }