in src/java/com/twitter/search/earlybird/index/EarlybirdSingleSegmentSearcher.java [312:408]
/**
 * Fills {@code metadata} with details of the tweet picked to represent the given facet term.
 *
 * <p>Resolution of the representative document:
 * <ul>
 *   <li>from_user_id CSF term: the term text is parsed as the user ID and the lookup is done
 *       with an equivalent term in the from_user_id field;</li>
 *   <li>twimg-links term: the term text is parsed as a tweet ID and mapped to a doc ID;</li>
 *   <li>in every case where no doc ID has been found yet, the oldest doc ID for the lookup
 *       term is used and the tweet ID is derived from it.</li>
 * </ul>
 *
 * <p>Nothing is written when no document is found, when the document is deleted, or when
 * {@code metadata} already carries a positive status ID that is less than or equal to the
 * resolved one.
 *
 * @param term the facet term being described
 * @param metadata the metadata object to populate
 * @param photoAccessor source of the term payload used as native photo URL for twimg terms;
 *     skipped when {@code null}
 * @param debugMode when greater than 3, a line describing the chosen tweet is appended to the
 *     metadata's explanation
 * @throws IOException declared by the signature; presumably raised by the index reads
 */
private void fillTermMetadata(Term term, ThriftFacetCountMetadata metadata,
                              FacetLabelProvider.FacetLabelAccessor photoAccessor,
                              byte debugMode) throws IOException {
  final boolean twimgTerm =
      term.field().equals(EarlybirdFieldConstant.TWIMG_LINKS_FIELD.getFieldName());
  final boolean fromUserTerm =
      term.field().equals(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName());

  long authorId = -1;
  long tweetId = -1;
  int docId = DocIDToTweetIDMapper.ID_NOT_FOUND;
  Term lookupTerm = term;

  if (fromUserTerm) {
    // from_user_id facet: the term text is the user ID; look it up via the indexed field.
    authorId = Long.parseLong(term.text());
    lookupTerm = new Term(
        EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName(),
        LongTermAttributeImpl.copyIntoNewBytesRef(authorId));
  } else if (twimgTerm) {
    // twimg facet: the term text is the tweet ID; try to locate it directly.
    tweetId = Long.parseLong(term.text());
    docId = twitterReader.getSegmentData().getDocIDToTweetIDMapper().getDocID(tweetId);
  }

  if (docId == DocIDToTweetIDMapper.ID_NOT_FOUND) {
    // Regular path for non-twimg terms. For a twimg term, getting here means the original
    // tweet was not found in this segment (or possibly this earlybird), so the term is
    // handled like a normal facet for now.
    docId = twitterReader.getOldestDocID(lookupTerm);
    tweetId = docId < 0
        ? -1
        : twitterReader.getSegmentData().getDocIDToTweetIDMapper().getTweetID(docId);
  }

  // No usable document for this term.
  if (docId < 0) {
    return;
  }
  // The representative tweet must not be deleted.
  if (twitterReader.getDeletesView().isDeleted(docId)) {
    return;
  }

  if (metadata.isSetStatusId()) {
    final long existingId = metadata.getStatusId();
    if (existingId > 0 && existingId <= tweetId) {
      // Metadata for this facet was already taken from an earlier tweet.
      return;
    }
  }

  // Tweet-level flags: offensive / sensitive content.
  final EarlybirdDocumentFeatures features = new EarlybirdDocumentFeatures(twitterReader);
  features.advance(docId);
  final boolean flaggedOffensive =
      features.isFlagSet(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG);
  final boolean flaggedSensitive =
      features.isFlagSet(EarlybirdFieldConstant.IS_SENSITIVE_CONTENT);

  // User-level flags: the author should not be antisocial, nsfw or offensive either.
  if (authorId < 0) {
    authorId = features.getFeatureValue(EarlybirdFieldConstant.FROM_USER_ID_CSF);
  }
  final int userBits =
      UserTable.ANTISOCIAL_BIT | UserTable.OFFENSIVE_BIT | UserTable.NSFW_BIT;
  // Looked up unconditionally, independent of the tweet-level flags.
  final boolean authorFlagged = userTable.isSet(authorId, userBits);
  final boolean possiblySensitive = flaggedOffensive || flaggedSensitive || authorFlagged;

  metadata.setStatusId(tweetId);
  metadata.setTwitterUserId(authorId);
  metadata.setCreated_at(twitterReader.getSegmentData().getTimeMapper().getTime(docId));

  final int languageId = (int) features.getFeatureValue(EarlybirdFieldConstant.LANGUAGE);
  final Locale locale = ThriftLanguageUtil.getLocaleOf(ThriftLanguage.findByValue(languageId));
  metadata.setStatusLanguage(ThriftLanguageUtil.getThriftLanguageOf(locale));
  metadata.setStatusPossiblySensitive(possiblySensitive);

  // For twimg terms, the term payload supplies the native photo URL if not yet set.
  if (twimgTerm && photoAccessor != null && !metadata.isSetNativePhotoUrl()) {
    final int termId = twitterReader.getTermID(term);
    final BytesRef payload = termId == EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND
        ? null
        : photoAccessor.getTermPayload(termId);
    if (payload != null) {
      metadata.setNativePhotoUrl(payload.utf8ToString());
    }
  }

  if (debugMode <= 3) {
    return;
  }

  // Append a debug line to whatever explanation is already present.
  final String previous = metadata.isSetExplanation() ? metadata.getExplanation() : "";
  final String debugLine = String.format(
      "TweetId=%d (%s %s), UserId=%d (%s %s), Term=%s\n",
      tweetId,
      flaggedOffensive ? "OFFENSIVE" : "",
      flaggedSensitive ? "SENSITIVE" : "",
      authorId,
      userTable.isSet(authorId, UserTable.ANTISOCIAL_BIT) ? "ANTISOCIAL" : "",
      userTable.isSet(authorId, UserTable.NSFW_BIT) ? "NSFW" : "",
      term.toString());
  metadata.setExplanation(previous + debugLine);
}