in java/src/main/java/com/twitter/twittertext/Extractor.java [450:498]
/**
 * Extracts the hashtag entities found in {@code text}.
 *
 * <p>Every match of {@code Regex.VALID_HASHTAG} is kept unless the text immediately
 * following the match is matched by {@code Regex.INVALID_HASHTAG_MATCH_END}.
 *
 * @param text the text to scan
 * @param checkUrlOverlap when {@code true}, URLs are extracted from the same text via
 *     {@code extractURLsWithIndices}, merged with the hashtags, passed through
 *     {@code removeOverlappingEntities}, and then every non-hashtag entity is dropped
 *     again, so only the hashtags that survived the overlap removal are returned
 * @return the extracted hashtag entities; {@code Collections.emptyList()} when
 *     {@code isEmptyString(text)} is true or the text contains no hash character
 */
private List<Entity> extractHashtagsWithIndices(String text, boolean checkUrlOverlap) {
  if (isEmptyString(text)) {
    return Collections.emptyList();
  }

  // Performance optimization.
  // If text contains neither '#' (U+0023) nor the fullwidth number sign (U+FF03),
  // it cannot contain a hashtag, so we can return an empty list without running
  // the regex. The fullwidth form is written as an escape so the check does not
  // depend on the source file's encoding.
  // NOTE(review): assumes Regex.VALID_HASHTAG also accepts U+FF03 as a hashtag
  // marker -- confirm against the pattern definition.
  if (text.indexOf('#') < 0 && text.indexOf('\uFF03') < 0) {
    return Collections.emptyList();
  }

  List<Entity> extracted = new ArrayList<Entity>();
  Matcher matcher = Regex.VALID_HASHTAG.matcher(text);
  while (matcher.find()) {
    // Reject candidates whose trailing context marks them as invalid.
    String after = text.substring(matcher.end());
    if (!Regex.INVALID_HASHTAG_MATCH_END.matcher(after).find()) {
      extracted.add(new Entity(matcher, Entity.Type.HASHTAG, Regex.VALID_HASHTAG_GROUP_TAG));
    }
  }

  if (checkUrlOverlap) {
    // extract URLs
    List<Entity> urls = extractURLsWithIndices(text);
    if (!urls.isEmpty()) {
      extracted.addAll(urls);
      // remove overlap
      removeOverlappingEntities(extracted);
      // remove URL entities: they were only added to take part in overlap removal
      Iterator<Entity> it = extracted.iterator();
      while (it.hasNext()) {
        Entity entity = it.next();
        if (entity.getType() != Entity.Type.HASHTAG) {
          it.remove();
        }
      }
    }
  }
  return extracted;
}