in java/src/main/java/com/twitter/twittertext/TwitterTextParser.java [83:183]
private static TwitterTextParseResults parseTweet(@Nullable final String tweet,
@Nonnull final TwitterTextConfiguration config,
boolean extractURLs) {
if (tweet == null || tweet.trim().length() == 0) {
return EMPTY_TWITTER_TEXT_PARSE_RESULTS;
}
final String normalizedTweet = Normalizer.normalize(tweet, Normalizer.Form.NFC);
final int tweetLength = normalizedTweet.length();
if (tweetLength == 0) {
return EMPTY_TWITTER_TEXT_PARSE_RESULTS;
}
final int scale = config.getScale();
final int maxWeightedTweetLength = config.getMaxWeightedTweetLength();
final int scaledMaxWeightedTweetLength = maxWeightedTweetLength * scale;
final int transformedUrlWeight = config.getTransformedURLLength() * scale;
final List<TwitterTextWeightedRange> ranges = config.getRanges();
final List<Extractor.Entity> urlEntities = EXTRACTOR.extractURLsWithIndices(normalizedTweet);
boolean hasInvalidCharacters = false;
int weightedCount = 0;
int offset = 0;
int validOffset = 0;
final Map<Integer, Integer> emojiMap = new HashMap<>();
if (config.getEmojiParsingEnabled()) {
final Matcher emojiMatcher = TwitterTextEmojiRegex.VALID_EMOJI_PATTERN
.matcher(normalizedTweet);
while (emojiMatcher.find()) {
final int start = emojiMatcher.start();
final int end = emojiMatcher.end();
emojiMap.put(start, end - start);
}
}
while (offset < tweetLength) {
int charWeight = config.getDefaultWeight();
if (extractURLs) {
final ListIterator<Extractor.Entity> urlEntityIterator = urlEntities.listIterator();
while (urlEntityIterator.hasNext()) {
final Extractor.Entity urlEntity = urlEntityIterator.next();
if (urlEntity.start == offset) {
final int urlLength = urlEntity.end - urlEntity.start;
weightedCount += transformedUrlWeight;
offset += urlLength;
if (weightedCount <= scaledMaxWeightedTweetLength) {
validOffset += urlLength;
}
urlEntityIterator.remove();
break;
}
}
}
if (offset < tweetLength) {
final int codePoint = normalizedTweet.codePointAt(offset);
int emojiLength = -1;
if (emojiMap.containsKey(offset)) {
charWeight = config.getDefaultWeight();
emojiLength = emojiMap.get(offset);
}
if (emojiLength == -1) {
for (final TwitterTextWeightedRange weightedRange : ranges) {
if (weightedRange.getRange().isInRange(codePoint)) {
charWeight = weightedRange.getWeight();
break;
}
}
}
weightedCount += charWeight;
hasInvalidCharacters = hasInvalidCharacters ||
Validator.hasInvalidCharacters(normalizedTweet.substring(offset, offset + 1));
final int offsetDelta;
if (emojiLength != -1) {
offsetDelta = emojiLength;
} else {
offsetDelta = Character.charCount(codePoint);
}
offset += offsetDelta;
if (!hasInvalidCharacters && weightedCount <= scaledMaxWeightedTweetLength) {
validOffset += offsetDelta;
}
}
}
final int normalizedTweetOffset = tweet.length() - normalizedTweet.length();
final int scaledWeightedLength = weightedCount / scale;
final boolean isValid = !hasInvalidCharacters && scaledWeightedLength <= maxWeightedTweetLength;
final int permillage = scaledWeightedLength * 1000 / maxWeightedTweetLength;
return new TwitterTextParseResults(scaledWeightedLength, permillage, isValid,
new Range(0, offset + normalizedTweetOffset - 1),
new Range(0, validOffset + normalizedTweetOffset - 1));
}