js/src/extractUrlsWithIndices.js (75 lines of code) (raw):
// Copyright 2018 Twitter, Inc.
// Licensed under the Apache License, Version 2.0
// http://www.apache.org/licenses/LICENSE-2.0
import extractUrl from './regexp/extractUrl';
import invalidUrlWithoutProtocolPrecedingChars from './regexp/invalidUrlWithoutProtocolPrecedingChars';
import idna from './lib/idna';
import validAsciiDomain from './regexp/validAsciiDomain';
import validTcoUrl from './regexp/validTcoUrl';
// Protocol assumed for length validation when a matched URL has no explicit protocol.
const DEFAULT_PROTOCOL = 'https://';
// Default value of the `options` argument of extractUrlsWithIndices:
// URLs without a protocol are extracted too.
const DEFAULT_PROTOCOL_OPTIONS = { extractUrlsWithoutProtocol: true };
// Upper bound (inclusive) on protocol length + URL length, measured with the
// domain in its idna.toAscii() form; see isValidUrl.
const MAX_URL_LENGTH = 4096;
// t.co URLs whose slug is longer than this are not extracted.
const MAX_TCO_SLUG_LENGTH = 40;
/**
 * Finds every URL in `text` and reports each one together with its position.
 *
 * @param {string} text - Text to scan. Falsy input yields an empty array.
 * @param {Object} [options] - Extraction options; defaults to
 *   DEFAULT_PROTOCOL_OPTIONS.
 * @param {boolean} [options.extractUrlsWithoutProtocol] - When truthy, URLs
 *   written without a protocol are extracted as well.
 * @returns {Array<{url: string, indices: number[]}>} One entry per extracted
 *   URL, in order of appearance. `indices` is `[start, end]` into `text`,
 *   with `end` being the position just past the URL.
 */
const extractUrlsWithIndices = function(text, options = DEFAULT_PROTOCOL_OPTIONS) {
  // Cheap pre-check: a protocol-less URL needs at least a '.', a URL with a
  // protocol needs at least a ':'.
  if (!text || (options.extractUrlsWithoutProtocol ? !text.match(/\./) : !text.match(/:/))) {
    return [];
  }

  const urls = [];

  // extractUrl is a shared regex that is advanced through its lastIndex.
  // Always start at the beginning, even if an earlier scan was interrupted
  // (e.g. by an exception) and left a stale lastIndex behind.
  extractUrl.lastIndex = 0;

  let match;
  while ((match = extractUrl.exec(text)) !== null) {
    // Read the capture groups from the match itself instead of the deprecated
    // RegExp.$N statics. Groups that did not participate are normalised to ''
    // (which is what the statics used to report).
    const before = match[2] || '';
    let url = match[3] || '';
    const protocol = match[4] || '';
    const domain = match[5] || '';
    const path = match[7] || '';
    let endPosition = extractUrl.lastIndex;
    const startPosition = endPosition - url.length;

    if (!isValidUrl(url, protocol || DEFAULT_PROTOCOL, domain)) {
      continue;
    }

    if (!protocol) {
      // URL without protocol: only accepted when enabled and when the
      // preceding text does not end in a disqualifying character.
      if (!options.extractUrlsWithoutProtocol || before.match(invalidUrlWithoutProtocolPrecedingChars)) {
        continue;
      }

      // Extract ASCII-only domains. replace() is used purely to visit every
      // validAsciiDomain match inside the domain; its result is discarded.
      let lastUrl = null;
      let asciiEndPosition = 0;
      domain.replace(validAsciiDomain, (asciiDomain) => {
        const asciiStartPosition = domain.indexOf(asciiDomain, asciiEndPosition);
        asciiEndPosition = asciiStartPosition + asciiDomain.length;
        lastUrl = {
          url: asciiDomain,
          indices: [startPosition + asciiStartPosition, startPosition + asciiEndPosition]
        };
        urls.push(lastUrl);
      });

      // No ASCII-only domain found. Skip the entire URL.
      if (lastUrl === null) {
        continue;
      }

      // lastUrl only contains the domain. Append path and query if they
      // exist by swapping the full domain for the last ASCII domain.
      if (path) {
        lastUrl.url = url.replace(domain, lastUrl.url);
        lastUrl.indices[1] = endPosition;
      }
    } else {
      // In the case of t.co URLs, don't allow additional path characters:
      // the URL is cut down to the part matched by validTcoUrl.
      // NOTE(review): assumes validTcoUrl is a non-global regex, so that
      // match() returns [fullMatch, slug] — confirm against its definition.
      const tcoMatch = url.match(validTcoUrl);
      if (tcoMatch) {
        const tcoUrlSlug = tcoMatch[1];
        if (tcoUrlSlug && tcoUrlSlug.length > MAX_TCO_SLUG_LENGTH) {
          continue;
        }
        url = tcoMatch[0];
        endPosition = startPosition + url.length;
      }
      urls.push({
        url: url,
        indices: [startPosition, endPosition]
      });
    }
  }

  return urls;
};
/**
 * Decides whether a matched URL is acceptable for extraction.
 *
 * The domain is converted with idna.toAscii(); a URL whose domain yields no
 * result is rejected. Otherwise the URL is accepted when the protocol length
 * plus the URL length — with the domain counted at its converted length —
 * does not exceed MAX_URL_LENGTH.
 *
 * @param {string} url - The matched URL text.
 * @param {string} protocol - Protocol whose length is added to the total.
 * @param {string} domain - The domain portion of `url`.
 * @returns {boolean} true when the URL passes both checks.
 */
const isValidUrl = function(url, protocol, domain) {
  const rawLength = url.length;
  const asciiDomain = idna.toAscii(domain);

  if (!(asciiDomain && asciiDomain.length)) {
    return false;
  }

  // Swap the domain's original length for its converted length.
  const encodedLength = rawLength + asciiDomain.length - domain.length;
  return protocol.length + encodedLength <= MAX_URL_LENGTH;
};
export default extractUrlsWithIndices;