in java/src/main/java/com/twitter/twittertext/Extractor.java [327:373]
public List<Entity> extractURLsWithIndices(@Nullable String text) {
  // Scans 'text' with Regex.VALID_URL and returns one Entity of type URL per accepted
  // match, carrying the start/end offsets of the URL within 'text'.
  // Both early exits below return the shared Collections.emptyList() instance rather
  // than a fresh ArrayList.
  if (isEmptyString(text)) {
    return Collections.emptyList();
  }

  // Cheap pre-filter: a URL needs a '.' when protocol-less URLs are extracted, and a ':'
  // otherwise. If that character is absent there is nothing to find, so skip the regex.
  final char requiredChar = extractURLWithoutProtocol ? '.' : ':';
  if (text.indexOf(requiredChar) == -1) {
    return Collections.emptyList();
  }

  final List<Entity> found = new ArrayList<>();
  final Matcher urlMatcher = Regex.VALID_URL.matcher(text);

  while (urlMatcher.find()) {
    final String protocol = urlMatcher.group(Regex.VALID_URL_GROUP_PROTOCOL);

    if (isEmptyString(protocol)) {
      // A match without a protocol is only kept when protocol-less extraction is on...
      if (!extractURLWithoutProtocol) {
        continue;
      }
      // ...and the text captured just before the URL is not an invalid lead-in.
      final String preceding = urlMatcher.group(Regex.VALID_URL_GROUP_BEFORE);
      if (Regex.INVALID_URL_WITHOUT_PROTOCOL_MATCH_BEGIN.matcher(preceding).matches()) {
        continue;
      }
    }

    String matchedUrl = urlMatcher.group(Regex.VALID_URL_GROUP_URL);
    final int startIndex = urlMatcher.start(Regex.VALID_URL_GROUP_URL);
    int endIndex = urlMatcher.end(Regex.VALID_URL_GROUP_URL);

    final Matcher tcoMatcher = Regex.VALID_TCO_URL.matcher(matchedUrl);
    if (tcoMatcher.find()) {
      // t.co links: reject slugs longer than MAX_TCO_SLUG_LENGTH, and otherwise keep only
      // the portion matched by VALID_TCO_URL, dropping any additional trailing characters.
      if (tcoMatcher.group(1).length() > MAX_TCO_SLUG_LENGTH) {
        continue;
      }
      matchedUrl = tcoMatcher.group(0);
      // The URL may have been shortened, so recompute the end offset from the start.
      endIndex = startIndex + matchedUrl.length();
    }

    final String host = urlMatcher.group(Regex.VALID_URL_GROUP_DOMAIN);
    if (!isValidHostAndLength(matchedUrl.length(), protocol, host)) {
      continue;
    }
    found.add(new Entity(startIndex, endIndex, matchedUrl, Entity.Type.URL));
  }

  return found;
}