in src/java/com/twitter/search/common/schema/earlybird/EarlybirdSchemaCreateTool.java [484:701]
/**
 * Builds the Thrift schema for the given Earlybird cluster.
 *
 * <p>The schema is assembled on a single {@link EarlybirdSchemaBuilder} in a fixed order: schema
 * version, ID and default-searched text fields, the remaining indexed fields, photo URL facet
 * and out-of-order settings, column stride fields, feature configurations and finally facet
 * configs. The helpers below are invoked in exactly that order; do not reorder them without
 * checking whether the builder depends on declaration order.
 *
 * @param cluster the cluster the schema is built for. It is passed to the builder, decides the
 *     default "load CSF into RAM" flag, and filters which feature configurations are added.
 * @return the schema produced by {@code builder.build()}.
 */
public static ThriftSchema buildThriftSchema(EarlybirdCluster cluster) {
  EarlybirdSchemaBuilder builder = new EarlybirdSchemaBuilder(
      new EarlybirdFieldConstants(), cluster, TokenStreamSerializer.Version.VERSION_2);

  builder.withSchemaVersion(
      FlushVersion.CURRENT_FLUSH_VERSION.getVersionNumber(),
      FlushVersion.CURRENT_FLUSH_VERSION.getMinorVersion(),
      FlushVersion.CURRENT_FLUSH_VERSION.getDescription(),
      FlushVersion.CURRENT_FLUSH_VERSION.isOfficial());

  declareIdAndDefaultSearchFields(builder);
  declareIndexedFields(builder);
  declarePhotoFacetAndOutOfOrderFields(builder);
  declareColumnStrideFields(builder, cluster);
  declareFeatureConfigurations(builder, cluster);
  declareFacetConfigs(builder);

  return builder.build();
}

/** Declares the partitioning ID field and the first text fields that are searched by default. */
private static void declareIdAndDefaultSearchFields(EarlybirdSchemaBuilder builder) {
  // ID field, used for partitioning
  builder.withPartitionFieldId(0)
      .withSortableLongTermField(EarlybirdFieldConstant.ID_FIELD.getFieldName())
      // Text Fields that are searched by default
      .withTextField(EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName(), true)
      .withSearchFieldByDefault(
          EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName(), 0.1f)
      .withPretokenizedTextField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), true)
      .withSearchFieldByDefault(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), 1.0f);
}

/**
 * Declares tweet-specific normalization for the text field, followed by every remaining indexed
 * field: text, long/int term, not-tokenized, named entity, user handle/name, space and URL
 * fields. Kept as one call chain, exactly as originally written.
 */
private static void declareIndexedFields(EarlybirdSchemaBuilder builder) {
  builder.withTweetSpecificNormalization(EarlybirdFieldConstant.TEXT_FIELD.getFieldName())
      .withTextField(EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(), true)
      .withSearchFieldByDefault(
          EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(), 0.2f)

      // Text fields not searched by default
      .withTextField(EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName(), false)
      .withTextField(EarlybirdFieldConstant.TO_USER_FIELD.getFieldName(), false)

      // cards are not searched by default, and have weight 0.
      .withPretokenizedTextField(EarlybirdFieldConstant.CARD_TITLE_FIELD.getFieldName(), false)
      .withPretokenizedTextField(
          EarlybirdFieldConstant.CARD_DESCRIPTION_FIELD.getFieldName(), false)
      .withTextField(EarlybirdFieldConstant.CARD_LANG.getFieldName(), false)

      // Out-of-order append fields
      .withLongTermField(EarlybirdFieldConstant.LIKED_BY_USER_ID_FIELD.getFieldName())
      .withLongTermField(EarlybirdFieldConstant.RETWEETED_BY_USER_ID.getFieldName())
      .withLongTermField(EarlybirdFieldConstant.REPLIED_TO_BY_USER_ID.getFieldName())

      // No Position fields, sorted alphabetically
      .withPretokenizedNoPositionField(EarlybirdFieldConstant.CARD_DOMAIN_FIELD.getFieldName())
      .withIndexedNotTokenizedField(EarlybirdFieldConstant.CARD_NAME_FIELD.getFieldName())
      .withIntTermField(EarlybirdFieldConstant.CREATED_AT_FIELD.getFieldName())
      .withIndexedNotTokenizedField(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName())
      .withIndexedNotTokenizedField(EarlybirdFieldConstant.GEO_HASH_FIELD.getFieldName())
      .withLongTermField(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName())
      .withLongTermField(EarlybirdFieldConstant.IN_REPLY_TO_TWEET_ID_FIELD.getFieldName())
      .withLongTermField(EarlybirdFieldConstant.IN_REPLY_TO_USER_ID_FIELD.getFieldName())
      .withLongTermField(EarlybirdFieldConstant.RETWEET_SOURCE_TWEET_ID_FIELD.getFieldName())
      .withLongTermField(EarlybirdFieldConstant.RETWEET_SOURCE_USER_ID_FIELD.getFieldName())
      .withLongTermField(EarlybirdFieldConstant.CONVERSATION_ID_FIELD.getFieldName())
      .withIndexedNotTokenizedField(EarlybirdFieldConstant.PLACE_ID_FIELD.getFieldName())
      .withTextField(EarlybirdFieldConstant.PLACE_FULL_NAME_FIELD.getFieldName(), false)
      .withIndexedNotTokenizedField(
          EarlybirdFieldConstant.PLACE_COUNTRY_CODE_FIELD.getFieldName())
      .withIndexedNotTokenizedField(
          EarlybirdFieldConstant.PROFILE_GEO_COUNTRY_CODE_FIELD.getFieldName())
      .withTextField(EarlybirdFieldConstant.PROFILE_GEO_REGION_FIELD.getFieldName(), false)
      .withTextField(EarlybirdFieldConstant.PROFILE_GEO_LOCALITY_FIELD.getFieldName(), false)
      .withTermTextLookup(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName())
      .withTermTextLookup(EarlybirdFieldConstant.IN_REPLY_TO_USER_ID_FIELD.getFieldName())
      .withPretokenizedNoPositionField(EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName())
      .withIndexedNotTokenizedField(ImmutableSchema.HF_PHRASE_PAIRS_FIELD)
      .withIndexedNotTokenizedField(ImmutableSchema.HF_TERM_PAIRS_FIELD)
      .withIndexedNotTokenizedField(EarlybirdFieldConstant.IMAGE_LINKS_FIELD.getFieldName())
      .withIndexedNotTokenizedField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName())
      .withIndexedNotTokenizedField(EarlybirdFieldConstant.ISO_LANGUAGE_FIELD.getFieldName())
      .withIndexedNotTokenizedField(EarlybirdFieldConstant.LINKS_FIELD.getFieldName())
      .withIntTermField(EarlybirdFieldConstant.LINK_CATEGORY_FIELD.getFieldName())
      .withIndexedNotTokenizedField(EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName())
      .withIndexedNotTokenizedField(EarlybirdFieldConstant.NEWS_LINKS_FIELD.getFieldName())
      .withIndexedNotTokenizedField(EarlybirdFieldConstant.NORMALIZED_SOURCE_FIELD.getFieldName())
      .withIndexedNotTokenizedField(EarlybirdFieldConstant.PLACE_FIELD.getFieldName())
      .withIndexedNotTokenizedField(EarlybirdFieldConstant.SOURCE_FIELD.getFieldName())
      .withPretokenizedNoPositionField(EarlybirdFieldConstant.STOCKS_FIELD.getFieldName())
      .withIndexedNotTokenizedField(EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName())
      .withIntTermField(NORMALIZED_FAVORITE_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD.getFieldName())
      .withIntTermField(NORMALIZED_REPLY_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD.getFieldName())
      .withIntTermField(NORMALIZED_RETWEET_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD.getFieldName())
      .withIntTermField(EarlybirdFieldConstant.COMPOSER_SOURCE.getFieldName())
      .withLongTermField(EarlybirdFieldConstant.QUOTED_TWEET_ID_FIELD.getFieldName())
      .withLongTermField(EarlybirdFieldConstant.QUOTED_USER_ID_FIELD.getFieldName())
      .withLongTermField(EarlybirdFieldConstant.DIRECTED_AT_USER_ID_FIELD.getFieldName())

      // Named entity fields
      .withIndexedNotTokenizedField(
          EarlybirdFieldConstant.NAMED_ENTITY_FROM_URL_FIELD.getFieldName(), true)
      .withIndexedNotTokenizedField(
          EarlybirdFieldConstant.NAMED_ENTITY_FROM_TEXT_FIELD.getFieldName(), true)
      .withIndexedNotTokenizedField(
          EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_URL_FIELD.getFieldName(), true)
      .withIndexedNotTokenizedField(
          EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_TEXT_FIELD.getFieldName(), true)

      // camelCase-tokenized user handles and tokenized user names, not searchable by default
      .withPretokenizedTextField(
          EarlybirdFieldConstant.CAMELCASE_USER_HANDLE_FIELD.getFieldName(), false)
      .withPretokenizedTextField(
          EarlybirdFieldConstant.TOKENIZED_USER_NAME_FIELD.getFieldName(), false)

      // Space fields
      .withIndexedNotTokenizedField(
          EarlybirdFieldConstant.SPACE_ID_FIELD.getFieldName())
      .withTextField(EarlybirdFieldConstant.SPACE_ADMIN_FIELD.getFieldName(), false)
      .withPretokenizedTextField(EarlybirdFieldConstant.SPACE_TITLE_FIELD.getFieldName(), false)
      .withTextField(EarlybirdFieldConstant.TOKENIZED_SPACE_ADMIN_FIELD.getFieldName(), true)
      .withPretokenizedTextField(
          EarlybirdFieldConstant.CAMELCASE_TOKENIZED_SPACE_ADMIN_FIELD.getFieldName(), false)
      .withPretokenizedTextField(
          EarlybirdFieldConstant.TOKENIZED_SPACE_ADMIN_DISPLAY_NAME_FIELD.getFieldName(), false)

      // URL description and title fields
      .withPretokenizedTextField(
          EarlybirdFieldConstant.URL_DESCRIPTION_FIELD.getFieldName(), false)
      .withPretokenizedTextField(
          EarlybirdFieldConstant.URL_TITLE_FIELD.getFieldName(), false);
}

/**
 * Declares the photo URL facet field and enables out-of-order handling for the liked-by,
 * retweeted-by and replied-to-by user ID fields.
 */
private static void declarePhotoFacetAndOutOfOrderFields(EarlybirdSchemaBuilder builder) {
  builder
      .withPhotoUrlFacetField(EarlybirdFieldConstant.TWIMG_LINKS_FIELD.getFieldName())
      .withOutOfOrderEnabledForField(
          EarlybirdFieldConstant.LIKED_BY_USER_ID_FIELD.getFieldName())
      .withOutOfOrderEnabledForField(
          EarlybirdFieldConstant.RETWEETED_BY_USER_ID.getFieldName())
      .withOutOfOrderEnabledForField(
          EarlybirdFieldConstant.REPLIED_TO_BY_USER_ID.getFieldName());
}

/**
 * Declares all column stride fields (CSFs), including the encoded and extended encoded tweet
 * feature fields.
 *
 * @param builder the schema builder to configure.
 * @param cluster used to compute the default value of the last (load-into-RAM) argument: it is
 *     {@code false} only for {@code EarlybirdCluster.FULL_ARCHIVE}. Fields that pass a literal
 *     {@code true} instead are loaded into RAM on every cluster.
 */
private static void declareColumnStrideFields(
    EarlybirdSchemaBuilder builder, EarlybirdCluster cluster) {
  // ColumnStrideFields.
  boolean loadCSFIntoRAMDefault = cluster != EarlybirdCluster.FULL_ARCHIVE;

  builder
      .withColumnStrideField(EarlybirdFieldConstants.ENCODED_TWEET_FEATURES_FIELD_NAME,
          ThriftCSFType.INT, NUMBER_OF_INTEGERS_FOR_FEATURES,
          true, loadCSFIntoRAMDefault)
      .withColumnStrideField(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName(),
          ThriftCSFType.LONG, 1, false, /* the full archive loads this field into RAM */ true)
      .withColumnStrideField(EarlybirdFieldConstant.SHARED_STATUS_ID_CSF.getFieldName(),
          ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
      .withColumnStrideField(EarlybirdFieldConstant.CARD_TYPE_CSF_FIELD.getFieldName(),
          ThriftCSFType.BYTE, 1, false, loadCSFIntoRAMDefault)
      // CSF Used by archive mappers
      .withColumnStrideField(EarlybirdFieldConstant.CREATED_AT_CSF_FIELD.getFieldName(),
          ThriftCSFType.INT, 1, false, /* the full archive loads this field into RAM */ true)
      .withColumnStrideField(EarlybirdFieldConstant.ID_CSF_FIELD.getFieldName(),
          ThriftCSFType.LONG, 1, false, /* the full archive loads this field into RAM */ true)
      .withColumnStrideField(EarlybirdFieldConstant.LAT_LON_CSF_FIELD.getFieldName(),
          ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
      .withColumnStrideField(EarlybirdFieldConstant.CONVERSATION_ID_CSF.getFieldName(),
          ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
      .withColumnStrideField(EarlybirdFieldConstant.QUOTED_TWEET_ID_CSF.getFieldName(),
          ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
      .withColumnStrideField(EarlybirdFieldConstant.QUOTED_USER_ID_CSF.getFieldName(),
          ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
      .withColumnStrideField(EarlybirdFieldConstant.CARD_LANG_CSF.getFieldName(),
          ThriftCSFType.INT, 1, false, loadCSFIntoRAMDefault)
      .withColumnStrideField(EarlybirdFieldConstant.CARD_URI_CSF.getFieldName(),
          ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
      .withColumnStrideField(EarlybirdFieldConstant.DIRECTED_AT_USER_ID_CSF.getFieldName(),
          ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
      .withColumnStrideField(EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF.getFieldName(),
          ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
      .withColumnStrideField(
          EarlybirdFieldConstant.EXCLUSIVE_CONVERSATION_AUTHOR_ID_CSF.getFieldName(),
          ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
      /* Semicolon on separate line to preserve git blame. */;

  builder.withColumnStrideField(
      EarlybirdFieldConstants.EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME,
      ThriftCSFType.INT, NUMBER_OF_INTEGERS_FOR_EXTENDED_FEATURES,
      true, loadCSFIntoRAMDefault);
}

/**
 * Adds every entry of {@code FEATURE_CONFIGURATION_MAP} whose field constant reports itself as
 * valid in the given cluster; entries for other clusters are skipped.
 */
private static void declareFeatureConfigurations(
    EarlybirdSchemaBuilder builder, EarlybirdCluster cluster) {
  for (Map.Entry<String, FeatureConfiguration> entry : FEATURE_CONFIGURATION_MAP.entrySet()) {
    String fullName = entry.getKey();
    String baseName = getBaseFieldName(fullName);
    EarlybirdFieldConstant fieldConstant = EarlybirdFieldConstants.getFieldConstant(fullName);
    if (fieldConstant.isValidFieldInCluster(cluster)) {
      builder.withFeatureConfiguration(baseName, fullName, entry.getValue());
    }
  }
}

/** Adds the facet settings for all facet fields. */
private static void declareFacetConfigs(EarlybirdSchemaBuilder builder) {
  // Add facet settings for facet fields
  // boolean args are respectively whether to use skiplist, whether offensive, whether to use CSF
  builder
      .withFacetConfigs(EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName(),
          EarlybirdFieldConstant.MENTIONS_FACET, true, false, false)
      .withFacetConfigs(EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName(),
          EarlybirdFieldConstant.HASHTAGS_FACET, true, false, false)
      .withFacetConfigs(EarlybirdFieldConstant.STOCKS_FIELD.getFieldName(),
          EarlybirdFieldConstant.STOCKS_FACET, true, false, false)
      .withFacetConfigs(EarlybirdFieldConstant.IMAGE_LINKS_FIELD.getFieldName(),
          EarlybirdFieldConstant.IMAGES_FACET, true, true, false)
      .withFacetConfigs(EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName(),
          EarlybirdFieldConstant.VIDEOS_FACET, true, true, false)
      .withFacetConfigs(EarlybirdFieldConstant.NEWS_LINKS_FIELD.getFieldName(),
          EarlybirdFieldConstant.NEWS_FACET, true, false, false)
      .withFacetConfigs(EarlybirdFieldConstant.ISO_LANGUAGE_FIELD.getFieldName(),
          EarlybirdFieldConstant.LANGUAGES_FACET, false, false, false)
      .withFacetConfigs(EarlybirdFieldConstant.SOURCE_FIELD.getFieldName(),
          EarlybirdFieldConstant.SOURCES_FACET, false, false, false)
      .withFacetConfigs(EarlybirdFieldConstant.TWIMG_LINKS_FIELD.getFieldName(),
          EarlybirdFieldConstant.TWIMG_FACET, true, true, false)
      .withFacetConfigs(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName(),
          EarlybirdFieldConstant.FROM_USER_ID_FACET, false, false, true /* facet on CSF */)
      .withFacetConfigs(EarlybirdFieldConstant.SHARED_STATUS_ID_CSF.getFieldName(),
          EarlybirdFieldConstant.RETWEETS_FACET, false, false, true /* facet on CSF */)
      .withFacetConfigs(EarlybirdFieldConstant.LINKS_FIELD.getFieldName(),
          EarlybirdFieldConstant.LINKS_FACET, true, false, false)
      // The next three use the overload without an explicit facet name argument.
      .withFacetConfigs(
          EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_URL_FIELD.getFieldName(),
          true, false, false)
      .withFacetConfigs(
          EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_TEXT_FIELD.getFieldName(),
          true, false, false)
      .withFacetConfigs(
          EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(),
          true, false, false)
      .withFacetConfigs(EarlybirdFieldConstant.SPACE_ID_FIELD.getFieldName(),
          EarlybirdFieldConstant.SPACES_FACET, true, false, false);
}