in rb/lib/twitter-text/extractor.rb [205:258]
# Extracts the URLs found in +text+ together with their start/end offsets.
#
# text    - The text to scan. nil/false produce an empty result; any other
#           value is coerced with #to_s before it is inspected or scanned.
# options - Hash of options. When :extract_url_without_protocol is truthy
#           (the default), URLs written without a protocol are extracted too.
#
# Returns an Array of Hashes shaped like
# {:url => String, :indices => [start, end]}. When a block is given, it is
# additionally yielded (url, start, end) once per extracted URL.
def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true})
  return [] unless text

  # Coerce once so the pre-check below and the scan operate on the same String
  # (previously the pre-check called #index on the raw argument and raised for
  # non-String input such as Integers or Symbols).
  text = text.to_s

  # Cheap pre-check: the scan is skipped entirely unless the text contains
  # "." (protocol-less URLs allowed) or ":" (protocol required).
  required_char = options[:extract_url_without_protocol] ? "." : ":"
  return [] unless text.include?(required_char)

  urls = []
  text.scan(Twitter::TwitterText::Regex[:valid_url]) do |_all, before, url, protocol, domain, _port, path, _query|
    # Keep the outer match: the regex operations below overwrite $~.
    valid_url_match_data = $~
    start_position = valid_url_match_data.char_begin(3)
    end_position = valid_url_match_data.char_end(3)

    if protocol
      if url =~ Twitter::TwitterText::Regex[:valid_tco_url]
        # Reject the match when the captured group exceeds the slug limit.
        next if $1 && $1.length > MAX_TCO_SLUG_LENGTH
        # Keep only the part matched by the t.co pattern and recompute the end offset.
        url = $&
        end_position = start_position + url.codepoint_length
      end
      next unless is_valid_domain(url.length, domain, protocol)
      urls << {
        :url => url,
        :indices => [start_position, end_position]
      }
    else
      next if !options[:extract_url_without_protocol] || before =~ Twitter::TwitterText::Regex[:invalid_url_without_protocol_preceding_chars]

      # Every ASCII domain found inside the matched domain that passes
      # is_valid_domain becomes its own entry.
      last_url = nil
      domain.scan(Twitter::TwitterText::Regex[:valid_ascii_domain]) do |ascii_domain|
        ascii_match = $~
        next unless is_valid_domain(url.length, ascii_domain, protocol)
        last_url = {
          :url => ascii_domain,
          :indices => [start_position + ascii_match.char_begin(0),
                       start_position + ascii_match.char_end(0)]
        }
        urls << last_url
      end
      next unless last_url

      # When a path was matched, the last accepted entry is widened to cover
      # the remainder of the matched URL.
      if path
        last_url[:url] = url.sub(domain, last_url[:url])
        last_url[:indices][1] = end_position
      end
    end
  end

  urls.each { |entity| yield entity[:url], entity[:indices].first, entity[:indices].last } if block_given?
  urls
end