public List extractURLsWithIndices()

in java/src/main/java/com/twitter/twittertext/Extractor.java [327:373]


  /**
   * Scans {@code text} for URLs and reports each one together with its
   * character offsets.
   *
   * <p>Every match of {@code Regex.VALID_URL} is considered a candidate. A
   * candidate is dropped when:
   * <ul>
   *   <li>it has no protocol and {@code extractURLWithoutProtocol} is off, or
   *       the text captured just before it matches
   *       {@code Regex.INVALID_URL_WITHOUT_PROTOCOL_MATCH_BEGIN};</li>
   *   <li>it matches {@code Regex.VALID_TCO_URL} and its slug is longer than
   *       {@code MAX_TCO_SLUG_LENGTH};</li>
   *   <li>{@code isValidHostAndLength} rejects its length, protocol and host.</li>
   * </ul>
   * For candidates that match {@code Regex.VALID_TCO_URL}, only the part matched
   * by that pattern is kept, and the end offset is shortened accordingly.
   *
   * @param text the text to scan; may be {@code null}
   * @return the accepted URL entities in the order they were matched;
   *     {@code Collections.emptyList()} when the text is empty or lacks the
   *     character every URL must contain
   */
  public List<Entity> extractURLsWithIndices(@Nullable String text) {
    if (isEmptyString(text)) {
      return Collections.emptyList();
    }

    // Cheap pre-filter: when protocol-less URLs are allowed a URL must contain
    // a '.', otherwise it must contain a ':'. Without that character there is
    // nothing to find, so the regex pass is skipped altogether.
    final char mandatoryChar = extractURLWithoutProtocol ? '.' : ':';
    if (text.indexOf(mandatoryChar) == -1) {
      return Collections.emptyList();
    }

    final List<Entity> found = new ArrayList<>();
    final Matcher urlMatcher = Regex.VALID_URL.matcher(text);

    while (urlMatcher.find()) {
      final String scheme = urlMatcher.group(Regex.VALID_URL_GROUP_PROTOCOL);

      if (isEmptyString(scheme)) {
        // Protocol-less candidates are only accepted when the feature is on...
        if (!extractURLWithoutProtocol) {
          continue;
        }
        // ...and when the text captured before the URL is not an invalid prefix.
        final String preceding = urlMatcher.group(Regex.VALID_URL_GROUP_BEFORE);
        if (Regex.INVALID_URL_WITHOUT_PROTOCOL_MATCH_BEGIN.matcher(preceding).matches()) {
          continue;
        }
      }

      final int start = urlMatcher.start(Regex.VALID_URL_GROUP_URL);
      int end = urlMatcher.end(Regex.VALID_URL_GROUP_URL);
      String url = urlMatcher.group(Regex.VALID_URL_GROUP_URL);

      final Matcher shortLinkMatcher = Regex.VALID_TCO_URL.matcher(url);
      if (shortLinkMatcher.find()) {
        // t.co links: reject over-long slugs, and otherwise keep only the part
        // matched by the t.co pattern so trailing path characters are dropped.
        if (shortLinkMatcher.group(1).length() > MAX_TCO_SLUG_LENGTH) {
          continue;
        }
        url = shortLinkMatcher.group(0);
        end = start + url.length();
      }

      final String domain = urlMatcher.group(Regex.VALID_URL_GROUP_DOMAIN);
      if (!isValidHostAndLength(url.length(), scheme, domain)) {
        continue;
      }
      found.add(new Entity(start, end, url, Entity.Type.URL));
    }

    return found;
  }